From dd93356697c0da75d7d0b9e637da7fb919f31ebf Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 20 Jun 2024 10:02:29 +0000 Subject: [PATCH 1/8] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads Consider the constrained multi-dword loads while merging individual loads to a single multi-dword load. --- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 + .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 79 +- .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 76 +- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 49 +- .../llvm.amdgcn.global.atomic.csub.ll | 24 +- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 141 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 86 +- .../GlobalISel/llvm.amdgcn.update.dpp.ll | 105 +- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 886 +- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 566 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 234 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 3255 +++-- llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 48 +- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 56 +- llvm/test/CodeGen/AMDGPU/bfm.ll | 16 +- llvm/test/CodeGen/AMDGPU/bitreverse.ll | 162 +- llvm/test/CodeGen/AMDGPU/build_vector.ll | 84 +- .../CodeGen/AMDGPU/calling-conventions.ll | 212 +- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 96 +- .../CodeGen/AMDGPU/combine-cond-add-sub.ll | 242 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 394 +- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 505 +- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 88 +- llvm/test/CodeGen/AMDGPU/ctpop64.ll | 124 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 296 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 266 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 511 +- .../AMDGPU/divergence-driven-buildvector.ll | 176 +- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 243 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 213 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 167 +- llvm/test/CodeGen/AMDGPU/fabs.ll | 112 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 512 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 659 +- llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 337 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 358 +- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 3322 ++--- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 306 +- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 788 +- .../AMDGPU/fmul-2-combine-multi-use.ll | 324 +- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 500 +- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 123 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 72 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 178 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 48 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 218 +- .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 50 +- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 376 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 322 +- .../AMDGPU/fp-min-max-buffer-atomics.ll | 263 +- .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll | 249 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll | 6 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll | 6 +- llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll | 6 +- .../AMDGPU/fp64-min-max-buffer-atomics.ll | 148 +- .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll | 148 +- llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 118 +- llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 104 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 378 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 218 +- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 2714 ++-- .../AMDGPU/global_atomics_i32_system.ll | 332 +- llvm/test/CodeGen/AMDGPU/half.ll | 366 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 734 +- .../insert_waitcnt_for_precise_memory.ll | 424 +- llvm/test/CodeGen/AMDGPU/kernel-args.ll | 933 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 202 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 1136 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 1088 +- .../AMDGPU/llvm.amdgcn.global.atomic.csub.ll | 8 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 746 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 712 +- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 20 +- .../AMDGPU/llvm.amdgcn.intersect_ray.ll | 105 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 10386 ++-------------- .../AMDGPU/llvm.amdgcn.permlane16.var.ll | 224 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 117 +- .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 363 +- .../llvm.amdgcn.sched.group.barrier.gfx11.ll | 28 +- .../llvm.amdgcn.sched.group.barrier.gfx12.ll | 142 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 432 +- .../AMDGPU/llvm.amdgcn.set.inactive.ll | 68 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 280 +- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 238 +- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 238 +- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 149 +- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 254 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 254 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 290 +- .../AMDGPU/llvm.r600.read.local.size.ll | 146 +- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 558 +- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 166 +- llvm/test/CodeGen/AMDGPU/madak.ll | 596 +- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 90 +- llvm/test/CodeGen/AMDGPU/merge-s-load.mir | 180 +- llvm/test/CodeGen/AMDGPU/min.ll | 830 +- .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 160 +- llvm/test/CodeGen/AMDGPU/mul.ll | 888 +- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 278 +- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 338 +- llvm/test/CodeGen/AMDGPU/or.ll | 376 +- llvm/test/CodeGen/AMDGPU/packed-op-sel.ll | 72 +- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 3080 +++-- .../AMDGPU/ptr-buffer-alias-scheduling.ll | 56 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 66 +- llvm/test/CodeGen/AMDGPU/rotr.ll | 48 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 180 +- llvm/test/CodeGen/AMDGPU/sign_extend.ll | 100 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 98 +- llvm/test/CodeGen/AMDGPU/sub.ll | 198 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 230 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 112 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 748 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 136 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 355 +- llvm/test/CodeGen/AMDGPU/xor.ll | 276 +- 116 files changed, 22445 insertions(+), 30549 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index def89c785b855..6ccee799a671e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1012,6 +1012,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } + bool hasXnackReplay() const { return GFX8Insts; } /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 8b42d4a1dee7a..0b285d52b539e 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -216,7 +216,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass { CombineInfo &Paired, bool Modify = false); static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, const CombineInfo &Paired); - static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); + static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired, + const GCNSubtarget *STI = nullptr); static std::pair getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired); const TargetRegisterClass * @@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORD: case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: case AMDGPU::GLOBAL_STORE_DWORD: @@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX2: case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX2: @@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX3_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX3: case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX3: @@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX4: case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4: @@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return 8; case AMDGPU::DS_READ_B32: case AMDGPU::DS_READ_B32_gfx9: @@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORD_IMM_ec: + case AMDGPU::S_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return S_LOAD_IMM; case AMDGPU::DS_READ_B32: case AMDGPU::DS_READ_B32_gfx9: @@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORD_IMM_ec: + case AMDGPU::S_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return AMDGPU::S_LOAD_DWORD_IMM; case AMDGPU::GLOBAL_LOAD_DWORD: case AMDGPU::GLOBAL_LOAD_DWORDX2: @@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORD_IMM_ec: + case AMDGPU::S_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_LOAD_DWORDX8_IMM_ec: Result.SBase = true; return Result; case AMDGPU::DS_READ_B32: @@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs( // Copy to the old destination registers. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); - const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName); - const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName); + auto *Dest0 = TII->getNamedOperand(*CI.I, OpName); + auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName); + + // The constrained sload instructions in S_LOAD_IMM class will have + // `early-clobber` flag in the dst operand. Remove the flag before using the + // MOs in copies. + if (Dest0->isEarlyClobber()) + Dest0->setIsEarlyClobber(false); + + if (Dest1->isEarlyClobber()) + Dest1->setIsEarlyClobber(false); BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. @@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - const unsigned Opcode = getNewOpcode(CI, Paired); + const unsigned Opcode = getNewOpcode(CI, Paired, STM); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, - const CombineInfo &Paired) { + const CombineInfo &Paired, + const GCNSubtarget *STI) { const unsigned Width = CI.Width + Paired.Width; switch (getCommonInstClass(CI, Paired)) { @@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; } case S_LOAD_IMM: - switch (Width) { - default: - return 0; - case 2: - return AMDGPU::S_LOAD_DWORDX2_IMM; - case 3: - return AMDGPU::S_LOAD_DWORDX3_IMM; - case 4: - return AMDGPU::S_LOAD_DWORDX4_IMM; - case 8: - return AMDGPU::S_LOAD_DWORDX8_IMM; + // For targets that support XNACK replay, use the constrained load opcode. + if (STI && STI->hasXnackReplay()) { + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_LOAD_DWORDX2_IMM_ec; + case 3: + return AMDGPU::S_LOAD_DWORDX3_IMM_ec; + case 4: + return AMDGPU::S_LOAD_DWORDX4_IMM_ec; + case 8: + return AMDGPU::S_LOAD_DWORDX8_IMM_ec; + } + } else { + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_LOAD_DWORDX2_IMM; + case 3: + return AMDGPU::S_LOAD_DWORDX3_IMM; + case 4: + return AMDGPU::S_LOAD_DWORDX4_IMM; + case 8: + return AMDGPU::S_LOAD_DWORDX8_IMM; + } } case GLOBAL_LOAD: switch (Width) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 405b1e8f3a250..eb20178f9f4d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -452,7 +452,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -468,7 +468,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -513,7 +513,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -539,7 +539,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -589,7 +589,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -644,7 +644,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -679,7 +679,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -725,14 +725,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -769,17 +769,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -821,7 +821,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -858,7 +858,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -949,7 +949,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -986,7 +986,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1005,7 +1005,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1221,7 +1221,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1310,7 +1310,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1337,7 +1337,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1354,7 +1354,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 2d3b6ee3e9823..a018ea5bf18f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -13,10 +13,10 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -27,7 +27,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -43,7 +43,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -84,10 +84,10 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -109,10 +109,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -134,12 +134,12 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] ; GFX940-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) ret void @@ -159,12 +159,11 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret void @@ -184,12 +183,14 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, s3 +; GFX940-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: ds_pk_add_bf16 v1, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret void @@ -199,8 +200,10 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret <2 x i16> %ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index 59818b0b1bc39..ade6e55b482bb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -134,8 +134,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -147,10 +147,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -160,7 +160,7 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN @@ -179,8 +179,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -190,16 +190,16 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 69f9a5712b0b5..ce402fb4e4abc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 @@ -658,17 +658,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 -; GFX1013-NEXT: v_mov_b32_e32 v2, s2 -; GFX1013-NEXT: v_mov_b32_e32 v3, s3 +; GFX1013-NEXT: v_mov_b32_e32 v0, s4 +; GFX1013-NEXT: v_mov_b32_e32 v1, s5 +; GFX1013-NEXT: v_mov_b32_e32 v2, s6 +; GFX1013-NEXT: v_mov_b32_e32 v3, s7 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 @@ -681,40 +681,40 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 -; GFX11-NEXT: s_mov_b32 s9, 4.0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -742,7 +742,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 @@ -769,14 +769,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 -; GFX1013-NEXT: v_mov_b32_e32 v2, s2 -; GFX1013-NEXT: v_mov_b32_e32 v3, s3 +; GFX1013-NEXT: v_mov_b32_e32 v0, s4 +; GFX1013-NEXT: v_mov_b32_e32 v1, s5 +; GFX1013-NEXT: v_mov_b32_e32 v2, s6 +; GFX1013-NEXT: v_mov_b32_e32 v3, s7 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 @@ -789,36 +789,35 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -847,8 +846,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -866,7 +865,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: flat_load_dword v2, v[0:1] ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -876,8 +875,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -889,13 +888,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 +; GFX1013-NEXT: v_mov_b32_e32 v0, s2 +; GFX1013-NEXT: v_mov_b32_e32 v1, s3 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7] ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -904,30 +903,29 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7 -; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_movk_i32 s17, 0x102 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_lshlrev_b32 v2, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 -; GFX11-NEXT: v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_mov_b32_e32 v4, s9 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s16 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v10, s17 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v11, v[0:1] @@ -959,8 +957,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -975,7 +973,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: flat_load_dword v2, v[0:1] ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -985,8 +983,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -995,13 +993,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 +; GFX1013-NEXT: v_mov_b32_e32 v0, s2 +; GFX1013-NEXT: v_mov_b32_e32 v1, s3 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -1010,24 +1008,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6 -; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_movk_i32 s13, 0x102 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v8, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 5074f8814546e..0c60be9d94591 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,11 +4,11 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec @@ -23,7 +23,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -43,20 +43,20 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 -; GCN-NEXT: s_cselect_b32 s3, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: s_cmp_lg_u32 s2, 56 +; GCN-NEXT: s_cselect_b32 s4, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_mov_b32 s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 @@ -96,12 +96,12 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -116,7 +116,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0xcccccccd ; GCN-NEXT: s_mov_b32 s5, 0x4010cccc ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -140,12 +140,12 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -160,12 +160,12 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -180,7 +180,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -204,7 +204,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1.0 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -228,12 +228,12 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -248,7 +248,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -272,7 +272,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -296,7 +296,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -320,7 +320,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -340,11 +340,11 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -359,11 +359,11 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -378,11 +378,11 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -397,11 +397,11 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 2198ba9f1d964..1092bb4dc834a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -19,18 +19,18 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -46,7 +46,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) { ; GFX8-LABEL: update_dppi64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX10-LABEL: update_dppi64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -78,11 +78,10 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX11-LABEL: update_dppi64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -102,7 +101,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) { ; GFX8-LABEL: update_dppf64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -120,7 +119,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX10-LABEL: update_dppf64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -134,11 +133,10 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX11-LABEL: update_dppf64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -158,7 +156,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { ; GFX8-LABEL: update_dppv2i32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -176,25 +174,24 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2i32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -214,7 +211,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { ; GFX8-LABEL: update_dppv2f32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -232,25 +229,24 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2f32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -270,7 +266,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p0_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -288,7 +284,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX10-LABEL: update_dpp_p0_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -302,11 +298,10 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX11-LABEL: update_dpp_p0_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -326,7 +321,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p3_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -341,7 +336,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p3_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -354,8 +349,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX11-LABEL: update_dpp_p3_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -377,11 +371,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; GFX8-LABEL: update_dpp_p5_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s90, -1 ; GFX8-NEXT: s_mov_b32 s91, 0xe80000 -; GFX8-NEXT: s_add_u32 s88, s88, s9 +; GFX8-NEXT: s_add_u32 s88, s88, s3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -396,27 +390,26 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p5_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s14, -1 -; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-NEXT: s_add_u32 s12, s12, s9 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GFX10-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dpp_p5_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index cf69c50ed9357..8a2274cbfbf62 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -6,83 +6,83 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s8, s5, 31 -; GFX8-NEXT: s_add_i32 s0, s5, s8 -; GFX8-NEXT: s_xor_b32 s5, s0, s8 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_ashr_i32 s8, s7, 31 +; GFX8-NEXT: s_add_i32 s0, s7, s8 +; GFX8-NEXT: s_xor_b32 s7, s0, s8 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_ashr_i32 s4, s6, 31 +; GFX8-NEXT: s_add_i32 s5, s6, s4 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s7, s6, s8 +; GFX8-NEXT: s_xor_b32 s5, s5, s4 +; GFX8-NEXT: s_xor_b32 s6, s4, s8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s7, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -95,17 +95,16 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -113,17 +112,18 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -145,7 +145,7 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 @@ -305,7 +305,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s9, 31 ; GFX9-NEXT: s_ashr_i32 s12, s11, 31 @@ -459,7 +459,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s2, s9, 31 ; GFX10-NEXT: s_ashr_i32 s12, s11, 31 @@ -616,7 +616,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 @@ -692,121 +692,121 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s8, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s8 -; GFX9-NEXT: s_xor_b32 s6, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_ashr_i32 s9, s7, 31 -; GFX9-NEXT: s_add_i32 s7, s7, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s9 +; GFX9-NEXT: s_ashr_i32 s0, s14, 31 +; GFX9-NEXT: s_add_i32 s1, s14, s0 +; GFX9-NEXT: s_xor_b32 s1, s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_ashr_i32 s2, s15, 31 +; GFX9-NEXT: s_add_i32 s3, s15, s2 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s12, 0, s6 -; GFX9-NEXT: s_ashr_i32 s10, s4, 31 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_sub_i32 s6, 0, s1 +; GFX9-NEXT: s_ashr_i32 s4, s12, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s4, s4, s10 -; GFX9-NEXT: s_xor_b32 s4, s4, s10 -; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 +; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: s_ashr_i32 s5, s13, 31 +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s12, 0, s7 +; GFX9-NEXT: s_add_i32 s6, s12, s4 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_ashr_i32 s11, s5, 31 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: s_add_i32 s5, s5, s11 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 +; GFX9-NEXT: s_add_i32 s7, s13, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_xor_b32 s5, s5, s11 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 +; GFX9-NEXT: s_xor_b32 s7, s7, s5 +; GFX9-NEXT: s_xor_b32 s0, s4, s0 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s4, s10, s8 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 -; GFX9-NEXT: s_xor_b32 s4, s11, s9 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 +; GFX9-NEXT: s_xor_b32 s0, s5, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s1, s10, 31 -; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_add_i32 s0, s10, s1 -; GFX10-NEXT: s_add_i32 s3, s11, s2 -; GFX10-NEXT: s_xor_b32 s10, s0, s1 +; GFX10-NEXT: s_ashr_i32 s1, s14, 31 +; GFX10-NEXT: s_ashr_i32 s2, s15, 31 +; GFX10-NEXT: s_add_i32 s0, s14, s1 +; GFX10-NEXT: s_add_i32 s3, s15, s2 +; GFX10-NEXT: s_xor_b32 s4, s0, s1 ; GFX10-NEXT: s_xor_b32 s3, s3, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s0, 0, s10 -; GFX10-NEXT: s_sub_i32 s11, 0, s3 -; GFX10-NEXT: s_ashr_i32 s12, s9, 31 +; GFX10-NEXT: s_sub_i32 s0, 0, s4 +; GFX10-NEXT: s_sub_i32 s5, 0, s3 +; GFX10-NEXT: s_ashr_i32 s6, s13, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_add_i32 s7, s13, s6 +; GFX10-NEXT: s_xor_b32 s7, s7, s6 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1 -; GFX10-NEXT: s_ashr_i32 s11, s8, 31 -; GFX10-NEXT: s_add_i32 s0, s8, s11 -; GFX10-NEXT: s_add_i32 s8, s9, s12 -; GFX10-NEXT: s_xor_b32 s0, s0, s11 -; GFX10-NEXT: s_xor_b32 s8, s8, s12 +; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1 +; GFX10-NEXT: s_ashr_i32 s5, s12, 31 +; GFX10-NEXT: s_add_i32 s0, s12, s5 +; GFX10-NEXT: s_xor_b32 s1, s5, s1 +; GFX10-NEXT: s_xor_b32 s0, s0, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s1, s11, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX10-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 @@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s12, s2 +; GFX10-NEXT: s_xor_b32 s0, s6, s2 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 +; GFX10-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 @@ -845,8 +845,8 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s12, 31 ; GFX8-NEXT: s_add_i32 s0, s12, s2 @@ -986,19 +986,19 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s12, 31 -; GFX9-NEXT: s_add_i32 s0, s12, s4 -; GFX9-NEXT: s_xor_b32 s5, s0, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_ashr_i32 s6, s13, 31 -; GFX9-NEXT: s_add_i32 s7, s13, s6 +; GFX9-NEXT: s_ashr_i32 s6, s12, 31 +; GFX9-NEXT: s_add_i32 s0, s12, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_ashr_i32 s4, s13, 31 +; GFX9-NEXT: s_add_i32 s5, s13, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s7, s7, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_sub_i32 s13, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_xor_b32 s8, s8, s12 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s13, 0, s7 +; GFX9-NEXT: s_sub_i32 s13, 0, s5 ; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 ; GFX9-NEXT: s_ashr_i32 s13, s9, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -1017,62 +1017,62 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX9-NEXT: s_add_i32 s9, s9, s13 ; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v2 -; GFX9-NEXT: s_xor_b32 s4, s12, s4 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 +; GFX9-NEXT: s_xor_b32 s6, s12, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_ashr_i32 s4, s14, 31 -; GFX9-NEXT: s_add_i32 s5, s14, s4 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX9-NEXT: s_ashr_i32 s6, s14, 31 +; GFX9-NEXT: s_add_i32 s7, s14, s6 ; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_xor_b32 s7, s7, s6 ; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 -; GFX9-NEXT: s_sub_i32 s8, 0, s5 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: s_sub_i32 s8, 0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX9-NEXT: s_xor_b32 s6, s13, s6 -; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 -; GFX9-NEXT: s_ashr_i32 s6, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s6 +; GFX9-NEXT: s_xor_b32 s4, s13, s4 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: s_ashr_i32 s4, s15, 31 +; GFX9-NEXT: s_add_i32 s9, s15, s4 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_xor_b32 s9, s9, s6 +; GFX9-NEXT: s_xor_b32 s9, s9, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 -; GFX9-NEXT: s_ashr_i32 s7, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s7 -; GFX9-NEXT: s_xor_b32 s8, s8, s7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s10, 31 +; GFX9-NEXT: s_add_i32 s8, s10, s5 +; GFX9-NEXT: s_xor_b32 s8, s8, s5 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s5 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s7 ; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 @@ -1080,27 +1080,27 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_sub_i32 s8, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: s_ashr_i32 s7, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s7 +; GFX9-NEXT: s_xor_b32 s8, s8, s7 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: s_xor_b32 s4, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s7, s4 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc @@ -1112,12 +1112,12 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, s5, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s5, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] @@ -1125,18 +1125,18 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s0, s12, 31 ; GFX10-NEXT: s_ashr_i32 s1, s13, 31 ; GFX10-NEXT: s_ashr_i32 s2, s14, 31 ; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s4, s12, s0 -; GFX10-NEXT: s_add_i32 s5, s13, s1 +; GFX10-NEXT: s_add_i32 s6, s12, s0 +; GFX10-NEXT: s_add_i32 s7, s13, s1 ; GFX10-NEXT: s_add_i32 s12, s14, s2 ; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s4, s0 -; GFX10-NEXT: s_xor_b32 s15, s5, s1 +; GFX10-NEXT: s_xor_b32 s14, s6, s0 +; GFX10-NEXT: s_xor_b32 s15, s7, s1 ; GFX10-NEXT: s_xor_b32 s12, s12, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX10-NEXT: s_xor_b32 s13, s13, s3 @@ -1144,11 +1144,11 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s14 +; GFX10-NEXT: s_sub_i32 s6, 0, s14 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s5, 0, s15 +; GFX10-NEXT: s_sub_i32 s7, 0, s15 ; GFX10-NEXT: s_sub_i32 s19, 0, s12 ; GFX10-NEXT: s_ashr_i32 s16, s8, 31 ; GFX10-NEXT: s_ashr_i32 s17, s9, 31 @@ -1163,22 +1163,22 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s13 -; GFX10-NEXT: v_mul_lo_u32 v5, s5, v1 +; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s13 +; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s4, v3 +; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 ; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s4, s8, s16 -; GFX10-NEXT: s_add_i32 s5, s9, s17 +; GFX10-NEXT: s_add_i32 s6, s8, s16 +; GFX10-NEXT: s_add_i32 s7, s9, s17 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: s_add_i32 s8, s10, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 ; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s4, s16 -; GFX10-NEXT: s_xor_b32 s11, s5, s17 +; GFX10-NEXT: s_xor_b32 s10, s6, s16 +; GFX10-NEXT: s_xor_b32 s11, s7, s17 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 ; GFX10-NEXT: s_xor_b32 s8, s8, s18 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 @@ -1271,8 +1271,8 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s13, 31 ; GFX8-NEXT: s_ashr_i32 s6, s1, 31 @@ -1582,8 +1582,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s13, 31 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 @@ -1885,8 +1885,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s16, s1, 31 ; GFX10-NEXT: s_ashr_i32 s4, s13, 31 @@ -2187,25 +2187,25 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x80008 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_sext_i32_i8 s4, s6 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2222,52 +2222,52 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2280,19 +2280,18 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: sdiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2300,17 +2299,18 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2332,14 +2332,14 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_xor_b32 s8, s0, s3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s4, 0, s8 +; GFX8-NEXT: s_sub_i32 s6, 0, s8 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80018 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2351,10 +2351,10 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s9, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s9 -; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s9 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2420,45 +2420,45 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80010 -; GFX9-NEXT: s_ashr_i32 s5, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s5 -; GFX9-NEXT: s_xor_b32 s8, s0, s5 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0x80010 +; GFX9-NEXT: s_ashr_i32 s7, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s7 +; GFX9-NEXT: s_xor_b32 s8, s0, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x80018 -; GFX9-NEXT: s_ashr_i32 s9, s7, 31 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s5, s6, 0x80018 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s7, s7, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_xor_b32 s5, s5, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i8 s6, s4 +; GFX9-NEXT: s_sext_i32_i8 s4, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 -; GFX9-NEXT: s_sub_i32 s11, 0, s7 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x80008 -; GFX9-NEXT: s_ashr_i32 s11, s4, 31 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x80008 +; GFX9-NEXT: s_ashr_i32 s11, s6, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_add_i32 s6, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s4, s11 +; GFX9-NEXT: s_xor_b32 s4, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 @@ -2469,25 +2469,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s5, s10, s5 -; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 +; GFX9-NEXT: s_xor_b32 s6, s10, s7 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 @@ -2505,7 +2505,7 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 ; GFX10-NEXT: s_bfe_i32 s3, s0, 0x80010 @@ -2517,36 +2517,36 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: s_xor_b32 s3, s3, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s3 -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s9, s4, 31 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s4, s4, s9 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s4, s4, s9 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 @@ -2596,25 +2596,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x100010 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_sext_i32_i16 s4, s6 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,52 +2631,52 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x100010 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2689,19 +2689,18 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: sdiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x100010 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2709,17 +2708,18 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2741,14 +2741,14 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s8 ; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: s_sub_i32 s4, 0, s9 +; GFX8-NEXT: s_sub_i32 s6, 0, s9 ; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2760,10 +2760,10 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2829,15 +2829,15 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: s_sext_i32_i16 s0, s7 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_bfe_i32 s5, s5, 0x100010 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x100010 ; GFX9-NEXT: s_ashr_i32 s7, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s5, s5, s7 @@ -2847,27 +2847,27 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i16 s6, s4 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 ; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x100010 -; GFX9-NEXT: s_ashr_i32 s11, s4, 31 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x100010 +; GFX9-NEXT: s_ashr_i32 s11, s6, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_add_i32 s6, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: s_xor_b32 s4, s4, s11 +; GFX9-NEXT: s_xor_b32 s4, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s1 ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 @@ -2924,36 +2924,36 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_xor_b32 s1, s1, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s4, 0, s2 +; GFX10-NEXT: s_sub_i32 s6, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_sext_i32_i16 s6, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX10-NEXT: s_ashr_i32 s9, s4, 31 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s4, s4, s9 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s4, s4, s9 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 @@ -3002,25 +3002,25 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x30008 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x30000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_bfe_i32 s4, s6, 0x30000 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3037,12 +3037,12 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 @@ -3052,39 +3052,39 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: sdivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x30000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3099,19 +3099,18 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: sdivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x30008 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s5, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s5 -; GFX10-NEXT: s_xor_b32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s5 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s7, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s0, s0, s7 +; GFX10-NEXT: s_xor_b32 s1, s1, s6 +; GFX10-NEXT: s_xor_b32 s0, s0, s7 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 -; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3129,14 +3128,15 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s7, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 7, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3153,25 +3153,25 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s7, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x1b0000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_bfe_i32 s4, s6, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,12 +3188,12 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 @@ -3203,39 +3203,39 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s1, 0x1b0000 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x1b0000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3250,19 +3250,18 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x1b0000 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x1b0000 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s5, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s5 -; GFX10-NEXT: s_xor_b32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s5 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s7, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s0, s0, s7 +; GFX10-NEXT: s_xor_b32 s1, s1, s6 +; GFX10-NEXT: s_xor_b32 s0, s0, s7 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 -; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3280,14 +3279,15 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s7, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 8b94f93e44e56..62d8b7d6f045c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -6,32 +6,32 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s6, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -41,30 +41,30 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s0, 0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -73,28 +73,28 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s0, 0, s5 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s0, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -112,7 +112,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -251,7 +251,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -384,7 +384,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -522,7 +522,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_i32 s0, 0, s10 -; GFX9-NEXT: s_sub_i32 s1, 0, s11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX9-NEXT: s_sub_i32 s0, 0, s14 +; GFX9-NEXT: s_sub_i32 s1, 0, s15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s14 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s15 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s8, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s12, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s15, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX10-NEXT: s_sub_i32 s0, 0, s10 -; GFX10-NEXT: s_sub_i32 s1, 0, s11 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX10-NEXT: s_sub_i32 s0, 0, s14 +; GFX10-NEXT: s_sub_i32 s1, 0, s15 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s14 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s15 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s12, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s13, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[10:11] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 @@ -685,8 +685,8 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -783,7 +783,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -792,7 +792,6 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -800,7 +799,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -878,9 +878,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -979,8 +979,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1510,7 +1510,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 @@ -1546,9 +1546,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4] ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 ; GFX10-NEXT: s_subb_u32 s3, 0, s15 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6] ; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2 ; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2 @@ -1560,39 +1560,39 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 -; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5 +; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v11, s6, v13, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v2, s6, v16, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v6, s6, v11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v2, v1 +; GFX10-NEXT: v_add_co_u32 v1, s6, v2, v1 ; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v2, v5, v2, v0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s0, v7, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, s2, v8, 0 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4] @@ -1772,34 +1772,34 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX8-NEXT: s_bfe_u32 s7, s6, 0x80008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0xff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -1809,32 +1809,32 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0xff +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] @@ -1843,12 +1843,12 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: udiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1856,17 +1856,17 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -1884,8 +1884,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: udivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -1949,55 +1949,55 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80010 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshr_b32 s7, s0, 24 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s5 +; GFX9-NEXT: s_sub_i32 s2, 0, s7 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: s_and_b32 s8, s0, 0xff ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2012,7 +2012,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010 @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -2029,8 +2029,8 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2081,34 +2081,34 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s4, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s6, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -2118,32 +2118,32 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0xffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v2, v0, s[0:1] @@ -2152,12 +2152,12 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: udiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2165,17 +2165,17 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_sub_i32 s1, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2274,10 +2274,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2319,14 +2319,14 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2387,34 +2387,34 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX8-NEXT: s_bfe_u32 s7, s6, 0x30008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 7 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -2426,32 +2426,32 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: udivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 7 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2462,12 +2462,12 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: udivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x30008 +; GFX10-NEXT: s_bfe_u32 s6, s0, 0x30008 ; GFX10-NEXT: s_and_b32 s0, s0, 7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2475,17 +2475,17 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2505,34 +2505,34 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0x7ffffff +; GFX8-NEXT: s_and_b32 s7, s7, 0x7ffffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2544,32 +2544,32 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s1, 0x7ffffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0x7ffffff +; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2580,12 +2580,12 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s1, 0x7ffffff +; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff ; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2593,17 +2593,17 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index b751be51a9739..611a7b566070c 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -33,13 +33,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 @@ -65,12 +65,10 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -95,8 +93,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; VI-LABEL: s_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -116,37 +114,37 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: s_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 +; GFX10-NEXT: v_pk_add_u16 v1, s0, s1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 @@ -167,7 +165,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; VI-LABEL: s_test_add_self_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -186,7 +184,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: s_test_add_self_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -197,7 +195,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_add_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -208,7 +206,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_add_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -228,7 +226,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; VI-LABEL: s_test_add_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -245,26 +243,26 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_pk_add_u16 v1, s2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_pk_add_u16 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_pk_add_u16 v1, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 @@ -281,7 +279,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -300,7 +298,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_add_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -313,7 +311,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_add_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -325,10 +323,8 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_add_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -350,7 +346,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -369,7 +365,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -382,7 +378,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_add_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -394,10 +390,8 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_add_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -418,7 +412,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -437,7 +431,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_add_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -449,7 +443,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_add_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -461,10 +455,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_add_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -485,7 +477,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -503,7 +495,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -515,7 +507,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -527,10 +519,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -552,7 +542,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -570,7 +560,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -582,7 +572,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -594,10 +584,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -619,8 +607,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -642,14 +630,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -660,13 +648,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -678,12 +666,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -713,8 +699,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -738,14 +724,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 @@ -757,13 +743,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -776,10 +762,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -811,8 +795,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -836,14 +820,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 @@ -854,13 +838,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -872,12 +856,10 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -907,8 +889,8 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -933,13 +915,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -953,14 +935,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -974,12 +956,10 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 7cf18171a6cd7..559871d162e13 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -137,7 +137,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s4, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s4, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s7 +; GFX9-NEXT: s_sub_i32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: sdiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 -; GFX9-NEXT: s_xor_b32 s3, s2, s3 -; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: s_abs_i32 s0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_xor_b32 s1, s6, s7 +; GFX9-NEXT: s_abs_i32 s2, s6 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s3, s3, 31 +; GFX9-NEXT: s_ashr_i32 s1, s1, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s6, s5, s4 +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s3, s0 ; GFX9-NEXT: s_sub_i32 s2, s2, s6 -; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s2, s4 -; GFX9-NEXT: s_cmp_ge_u32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s3, 1 +; GFX9-NEXT: s_sub_i32 s6, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 ; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s2, s6, s5 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_add_i32 s6, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s6, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -359,7 +359,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s3, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: s_abs_i32 s0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 31 +; GFX9-NEXT: s_abs_i32 s2, s6 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s3, s3, s0 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s3, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -452,15 +452,15 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: udiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -468,20 +468,19 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -489,7 +488,6 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i16 %x, %y @@ -523,37 +521,36 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: urem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s5, s4, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_and_b32 s0, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -562,8 +559,8 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -600,8 +597,8 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: sdiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -626,27 +623,27 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_ashr_i32 s0, s4, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i16 %x, %y store i16 %r, ptr addrspace(1) %out @@ -683,8 +680,8 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: srem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s5, s4, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -711,8 +708,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -722,6 +718,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -733,6 +730,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i16 %x, %y @@ -764,8 +762,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: udiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -783,13 +781,13 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -829,8 +827,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: urem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 @@ -851,13 +849,13 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 -; GFX9-NEXT: s_lshr_b32 s2, s4, 8 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -865,8 +863,9 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i8 %x, %y @@ -902,8 +901,8 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: sdiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -928,27 +927,27 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i8 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -985,8 +984,8 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: srem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -1014,30 +1013,30 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i8 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -1179,13 +1178,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: s_sub_i32 s0, 0, s12 +; GFX6-NEXT: s_sub_i32 s2, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1195,28 +1194,28 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s12 -; GFX6-NEXT: s_sub_i32 s0, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s12 -; GFX6-NEXT: s_cmp_ge_u32 s0, s12 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s12 +; GFX6-NEXT: s_cmp_ge_u32 s2, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s12 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s2, s12 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 @@ -1277,9 +1276,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1499,36 +1498,34 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: urem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ -1536,58 +1533,60 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s9 -; GFX6-NEXT: s_sub_i32 s1, s5, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s9 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s5, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s10 -; GFX6-NEXT: s_sub_i32 s4, s6, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s10 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s6, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: s_mul_i32 s0, s0, s11 -; GFX6-NEXT: s_sub_i32 s0, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1843,34 +1842,34 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_abs_i32 s2, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_xor_b32 s4, s8, s12 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s8 ; GFX6-NEXT: s_ashr_i32 s8, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s4 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_mul_i32 s4, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_abs_i32 s4, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GFX6-NEXT: s_sub_i32 s5, 0, s4 @@ -1878,7 +1877,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s6, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 @@ -1965,16 +1964,17 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s1, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s3, s4, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s1, s1, 31 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 @@ -1982,82 +1982,81 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s12, s8, s0 +; GFX9-NEXT: s_mul_i32 s12, s8, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s12 ; GFX9-NEXT: s_add_i32 s13, s8, 1 -; GFX9-NEXT: s_sub_i32 s12, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s12, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s8, s13, s8 ; GFX9-NEXT: s_cselect_b32 s4, s12, s4 ; GFX9-NEXT: s_add_i32 s12, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s12, s8 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s12, s8 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_xor_b32 s8, s5, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s12, s0, s1 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: s_ashr_i32 s8, s8, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s1, s0, s4 -; GFX9-NEXT: s_sub_i32 s1, s5, s1 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s5, s1, s4 -; GFX9-NEXT: s_cmp_ge_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_add_i32 s5, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_abs_i32 s1, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s6, s10 -; GFX9-NEXT: s_abs_i32 s5, s6 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 +; GFX9-NEXT: s_add_i32 s3, s3, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s9, s3, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s12, s3, 1 +; GFX9-NEXT: s_sub_i32 s9, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s12, s3 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_add_i32 s9, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s9, s3 +; GFX9-NEXT: s_abs_i32 s4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_sub_i32 s8, s0, s8 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_xor_b32 s5, s6, s10 +; GFX9-NEXT: s_abs_i32 s6, s6 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s6, s0, s1 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s6, s5, s1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s5, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 +; GFX9-NEXT: s_mul_i32 s9, s8, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s9 +; GFX9-NEXT: s_add_i32 s10, s8, 1 +; GFX9-NEXT: s_sub_i32 s9, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s8, s10, s8 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_add_i32 s9, s8, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s4, s9, s8 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_xor_b32 s2, s7, s11 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_ashr_i32 s2, s2, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2077,7 +2076,6 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i32> %x, %y @@ -2244,34 +2242,35 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: srem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_abs_i32 s2, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: s_mul_i32 s8, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s8 -; GFX6-NEXT: s_sub_i32 s8, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s8, s1 -; GFX6-NEXT: s_sub_i32 s8, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s0, s8, s1 -; GFX6-NEXT: s_abs_i32 s1, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_sub_i32 s8, 0, s1 -; GFX6-NEXT: s_xor_b32 s0, s0, s4 -; GFX6-NEXT: s_sub_i32 s0, s0, s4 +; GFX6-NEXT: s_mul_i32 s8, s8, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s8 +; GFX6-NEXT: s_sub_i32 s8, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s8, s3 +; GFX6-NEXT: s_sub_i32 s8, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s2, s8, s3 +; GFX6-NEXT: s_abs_i32 s3, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s8, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s4, s2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2281,22 +2280,21 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s1, s8, s4 -; GFX6-NEXT: s_abs_i32 s4, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s8, 0, s4 -; GFX6-NEXT: s_xor_b32 s1, s1, s5 -; GFX6-NEXT: s_sub_i32 s1, s1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_abs_i32 s3, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s8, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s5 +; GFX6-NEXT: s_sub_i32 s5, s2, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 @@ -2305,59 +2303,59 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s5, v0 -; GFX6-NEXT: s_mul_i32 s5, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_sub_i32 s8, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s5, s8, s5 -; GFX6-NEXT: s_sub_i32 s8, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s4, s8, s5 -; GFX6-NEXT: s_abs_i32 s5, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_sub_i32 s8, 0, s5 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s8, s8, s2 +; GFX6-NEXT: s_abs_i32 s9, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_abs_i32 s0, s7 -; GFX6-NEXT: v_mul_lo_u32 v1, s8, v2 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_xor_b32 s2, s4, s6 -; GFX6-NEXT: s_sub_i32 s2, s2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_abs_i32 s4, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v2 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_ashr_i32 s1, s7, 31 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_ashr_i32 s5, s7, 31 +; GFX6-NEXT: s_xor_b32 s7, s8, s6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v2, s0, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s3, s3, s5 -; GFX6-NEXT: s_sub_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s3, s0, s5 -; GFX6-NEXT: s_cmp_ge_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_sub_i32 s3, s0, s5 -; GFX6-NEXT: s_cmp_ge_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 +; GFX6-NEXT: s_sub_i32 s6, s7, s6 +; GFX6-NEXT: v_readfirstlane_b32 s7, v2 +; GFX6-NEXT: s_mul_i32 s7, s7, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s7, s4, s9 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s7, s4, s9 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 -; GFX9-NEXT: s_ashr_i32 s1, s4, 31 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_ashr_i32 s3, s4, 31 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2367,73 +2365,72 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s8, s8, s0 +; GFX9-NEXT: s_mul_i32 s8, s8, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_sub_i32 s8, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s8, s4 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s8, s4 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s12, s0, s1 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s4 -; GFX9-NEXT: s_cmp_ge_u32 s0, s4 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s4 -; GFX9-NEXT: s_cmp_ge_u32 s0, s4 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_abs_i32 s1, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: s_ashr_i32 s4, s6, 31 -; GFX9-NEXT: s_abs_i32 s5, s6 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 +; GFX9-NEXT: s_add_i32 s3, s3, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_abs_i32 s4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_sub_i32 s8, s0, s8 +; GFX9-NEXT: s_ashr_i32 s5, s6, 31 +; GFX9-NEXT: s_abs_i32 s6, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s5, s0, s1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s5, s0, s1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s5, s5, s0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 +; GFX9-NEXT: s_mul_i32 s8, s8, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_sub_i32 s8, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_sub_i32 s8, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s4, s8, s6 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_ashr_i32 s2, s7, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2451,7 +2448,6 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i32> %x, %y @@ -2546,8 +2542,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,21 +2602,21 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -2658,7 +2654,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y @@ -2761,8 +2756,8 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2829,34 +2824,35 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_and_b32 s8, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s8, s5, 0xffff ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -2871,21 +2867,20 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 @@ -2999,8 +2994,8 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3079,79 +3074,79 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s0, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s8, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_ashr_i32 s3, s6, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 -; GFX9-NEXT: s_xor_b32 s2, s4, s3 +; GFX9-NEXT: s_xor_b32 s0, s4, s1 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: s_or_b32 s4, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX9-NEXT: s_sext_i32_i16 s1, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: s_ashr_i32 s3, s7, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 -; GFX9-NEXT: s_ashr_i32 s2, s5, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 +; GFX9-NEXT: s_ashr_i32 s0, s5, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3269,8 +3264,8 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3361,78 +3356,78 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GFX9-NEXT: s_sext_i32_i16 s9, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX9-NEXT: s_xor_b32 s2, s9, s8 +; GFX9-NEXT: s_xor_b32 s0, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s10, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s10, 0 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s6 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_xor_b32 s0, s4, s6 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: s_or_b32 s8, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| +; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 ; GFX9-NEXT: s_sext_i32_i16 s8, s7 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: s_sext_i32_i16 s6, s5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s2, s6, s8 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s6, s8 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s10, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s10, 0 ; GFX9-NEXT: s_ashr_i32 s7, s7, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s2, s5, s7 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_xor_b32 s0, s5, s7 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_or_b32 s8, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 +; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 @@ -3441,7 +3436,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3472,8 +3467,8 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: udiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 @@ -3494,15 +3489,15 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s2, s4, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s0, s4, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -3510,7 +3505,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v2, v0, s[0:1] +; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3543,8 +3538,8 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: urem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -3568,24 +3563,24 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s1, s4, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_and_b32 s4, s2, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] @@ -3623,8 +3618,8 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: sdiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3650,28 +3645,28 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3708,8 +3703,8 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: srem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -3738,27 +3733,27 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s0, 1 +; GFX9-NEXT: s_or_b32 s6, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 @@ -3837,8 +3832,8 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3884,21 +3879,21 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -3923,7 +3918,6 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4005,8 +3999,8 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4058,33 +4052,33 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s6, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_and_b32 s8, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 -; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc @@ -4093,16 +4087,17 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4190,8 +4185,8 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4252,62 +4247,62 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s0, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s8, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_ashr_i32 s3, s6, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v2, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s4, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GFX9-NEXT: s_sext_i32_i16 s1, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] +; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4399,8 +4394,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4469,8 +4464,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 @@ -4522,6 +4516,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -4529,6 +4524,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4604,31 +4600,33 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: udiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s10, 0x7fff -; GFX6-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s5, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s4, s6, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX6-NEXT: s_bfe_u32 s3, s10, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4651,33 +4649,31 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff -; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff +; GFX9-NEXT: s_and_b32 s1, s2, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: s_bfe_u32 s0, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f -; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -4791,41 +4787,41 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: urem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s3, s10, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_and_b32 s7, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX6-NEXT: s_bfe_u32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s10, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s6, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -4834,32 +4830,32 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 +; GFX6-NEXT: s_lshr_b32 s5, s8, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_lshr_b32 s2, s10, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_lshr_b32 s4, s6, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 @@ -5000,50 +4996,52 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: s_bfe_i32 s2, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s1, s1, s2 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 -; GFX6-NEXT: s_or_b32 s1, s1, 1 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_or_b32 s7, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s1, s1, 0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v4 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: s_cselect_b32 s4, s7, 0 +; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v4 +; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 30 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 -; GFX6-NEXT: s_or_b32 s2, s0, 1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX6-NEXT: s_cselect_b32 s0, s2, 0 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5061,46 +5059,43 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_bfe_i32 s2, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s1, s1, s2 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 -; GFX9-NEXT: s_or_b32 s1, s1, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 +; GFX9-NEXT: s_or_b32 s3, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v5 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: s_cselect_b32 s0, s3, 0 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -5110,6 +5105,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s2, 0 @@ -5227,73 +5223,73 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: srem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX6-NEXT: s_xor_b32 s1, s2, s1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 -; GFX6-NEXT: s_lshr_b32 s8, s10, 15 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: s_lshr_b32 s9, s0, 15 -; GFX6-NEXT: s_or_b32 s1, s1, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX6-NEXT: s_cselect_b32 s1, s1, 0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v6 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_lshr_b32 s7, s6, 15 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: s_lshr_b32 s9, s8, 15 +; GFX6-NEXT: s_or_b32 s10, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v6 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX6-NEXT: s_or_b32 s2, s0, 1 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: s_cselect_b32 s0, s2, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s0, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 @@ -5301,54 +5297,54 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s6, 0xf0000 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX9-NEXT: s_xor_b32 s1, s2, s1 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s8, s6, 15 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_lshr_b32 s7, s0, 15 -; GFX9-NEXT: s_or_b32 s1, s1, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v6 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 +; GFX9-NEXT: s_lshr_b32 s3, s2, 15 +; GFX9-NEXT: s_or_b32 s7, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s7, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 ; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 @@ -5367,7 +5363,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -5397,8 +5393,8 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -5413,17 +5409,17 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s3, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 20 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s1, s4, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s1, 20 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5438,8 +5434,8 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5450,13 +5446,13 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5472,7 +5468,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5486,13 +5482,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s3, s3, 12 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_add_i32 s0, s7, 12 +; GFX9-NEXT: s_lshr_b32 s0, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = udiv i32 %x, %shl.y @@ -5513,7 +5509,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5528,14 +5524,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s2, 12 -; GFX9-NEXT: s_lshr_b32 s3, s3, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s6, 12 +; GFX9-NEXT: s_lshr_b32 s1, s7, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5555,7 +5551,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -5574,18 +5570,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s2, s2, 12 -; GFX9-NEXT: s_lshr_b32 s3, s3, 11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s1, s7, 0x100101 +; GFX9-NEXT: s_sub_i32 s2, s7, s1 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_add_i32 s2, s2, s1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 12 +; GFX9-NEXT: s_lshr_b32 s1, s2, 11 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5664,42 +5660,42 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 @@ -5720,54 +5716,54 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s3, s2, s6 -; GFX9-NEXT: s_sub_i32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s9, s2, 1 -; GFX9-NEXT: s_sub_i32 s4, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s2, s9, s2 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s4, s2, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX9-NEXT: s_mul_i32 s7, s6, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s7 +; GFX9-NEXT: s_add_i32 s9, s6, 1 +; GFX9-NEXT: s_sub_i32 s7, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 -; GFX9-NEXT: s_mul_i32 s3, s3, s8 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 -; GFX9-NEXT: s_mul_i32 s4, s3, s7 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s6, s3, 1 -; GFX9-NEXT: s_sub_i32 s5, s4, s7 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, s7, s6 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX9-NEXT: s_mul_i32 s6, s4, s2 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s7, s4, 1 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s2, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -5784,10 +5780,10 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -5803,19 +5799,19 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s3, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 20 -; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s1, s4, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s1, 20 +; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5830,8 +5826,8 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5842,13 +5838,13 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5864,7 +5860,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5879,14 +5875,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_add_i32 s3, s3, -1 -; GFX9-NEXT: s_and_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_add_i32 s0, s0, -1 +; GFX9-NEXT: s_and_b32 s0, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = urem i32 %x, %shl.y @@ -5907,7 +5903,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5922,14 +5918,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xfff -; GFX9-NEXT: s_and_b32 s3, s3, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_and_b32 s0, s6, 0xfff +; GFX9-NEXT: s_and_b32 s1, s7, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6004,35 +6000,35 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s4, s4, s1 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 -; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s4, s4, s3 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 @@ -6049,56 +6045,55 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cselect_b32 s5, s7, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, s6 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s6, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_sub_i32 s6, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 -; GFX9-NEXT: s_mul_i32 s3, s3, s8 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 -; GFX9-NEXT: s_mul_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s4 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX9-NEXT: s_mul_i32 s4, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6115,8 +6110,8 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -6131,17 +6126,17 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 31 -; GFX9-NEXT: s_ashr_i32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s1, s0, 31 +; GFX9-NEXT: s_ashr_i32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6156,8 +6151,8 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6171,16 +6166,16 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s4, s4, s2 -; GFX9-NEXT: s_ashr_i32 s2, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s4, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s4, s4, s0 +; GFX9-NEXT: s_ashr_i32 s0, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6196,7 +6191,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6239,41 +6234,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 -; GFX9-NEXT: s_ashr_i32 s5, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s5 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s2, s6, 31 +; GFX9-NEXT: s_add_i32 s3, s6, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s6, s6, s7 ; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 ; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 -; GFX9-NEXT: s_mul_i32 s8, s6, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, s7 +; GFX9-NEXT: s_mul_i32 s8, s6, s0 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_sub_i32 s8, s3, s0 +; GFX9-NEXT: s_cmp_ge_u32 s3, s0 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_cselect_b32 s2, s8, s2 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s7, s6 -; GFX9-NEXT: s_xor_b32 s3, s5, s4 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_cmp_ge_u32 s3, s0 +; GFX9-NEXT: s_cselect_b32 s0, s7, s6 +; GFX9-NEXT: s_xor_b32 s1, s2, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = sdiv i32 %x, %shl.y @@ -6294,7 +6289,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6315,20 +6310,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_lshr_b32 s5, s5, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_add_i32 s3, s3, s5 -; GFX9-NEXT: s_ashr_i32 s2, s2, 12 -; GFX9-NEXT: s_ashr_i32 s3, s3, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_lshr_b32 s1, s1, 20 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_add_i32 s1, s7, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 12 +; GFX9-NEXT: s_ashr_i32 s1, s1, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6348,7 +6343,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6370,21 +6365,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_mul_hi_i32 s5, s3, 0x80080081 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_add_i32 s5, s5, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s5, 31 -; GFX9-NEXT: s_ashr_i32 s4, s5, 11 -; GFX9-NEXT: s_ashr_i32 s2, s2, 12 -; GFX9-NEXT: s_add_i32 s4, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_mul_hi_i32 s1, s7, 0x80080081 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s1, s1, s7 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_lshr_b32 s2, s1, 31 +; GFX9-NEXT: s_ashr_i32 s1, s1, 11 +; GFX9-NEXT: s_ashr_i32 s0, s0, 12 +; GFX9-NEXT: s_add_i32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6481,50 +6476,50 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s1, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_sub_i32 s6, 0, s1 -; GFX6-NEXT: s_xor_b32 s0, s4, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s6, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s4, s2 ; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX6-NEXT: s_abs_i32 s6, s4 -; GFX6-NEXT: s_ashr_i32 s4, s0, 31 +; GFX6-NEXT: s_ashr_i32 s4, s2, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s6, s0, s1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s6, s2, s3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_cselect_b32 s2, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_abs_i32 s6, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_xor_b32 s7, s5, s7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_abs_i32 s5, s5 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_ashr_i32 s7, s7, 31 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 @@ -6544,73 +6539,71 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s1, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s4, s0 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX9-NEXT: s_abs_i32 s7, s4 +; GFX9-NEXT: s_xor_b32 s2, s4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s0, 31 -; GFX9-NEXT: s_sub_i32 s0, 0, s1 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s8 -; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s8, s8, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s7, s8 -; GFX9-NEXT: s_mul_i32 s8, s0, s1 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s3 ; GFX9-NEXT: s_sub_i32 s7, s7, s8 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s8, s7, s1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 +; GFX9-NEXT: s_add_i32 s9, s4, 1 +; GFX9-NEXT: s_sub_i32 s8, s7, s3 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 ; GFX9-NEXT: s_cselect_b32 s7, s8, s7 -; GFX9-NEXT: s_add_i32 s8, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s7, s8, s0 -; GFX9-NEXT: s_abs_i32 s8, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_xor_b32 s2, s5, s6 -; GFX9-NEXT: s_abs_i32 s3, s5 +; GFX9-NEXT: s_add_i32 s8, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s3, s8, s4 +; GFX9-NEXT: s_abs_i32 s4, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s7, 0, s4 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s7, s4 -; GFX9-NEXT: s_sub_i32 s6, 0, s8 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: s_ashr_i32 s6, s6, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 31 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX9-NEXT: s_mul_i32 s6, s5, s8 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s3 +; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7 +; GFX9-NEXT: s_add_i32 s3, s3, s7 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s7, s3, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 +; GFX9-NEXT: s_add_i32 s8, s3, 1 +; GFX9-NEXT: s_sub_i32 s7, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s3, s6, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s7, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6627,9 +6620,9 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6647,19 +6640,19 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 31 -; GFX9-NEXT: s_ashr_i32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s1, s0, 31 +; GFX9-NEXT: s_ashr_i32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6674,8 +6667,8 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6690,17 +6683,17 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s4, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s4, s0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6716,7 +6709,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -6753,38 +6746,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 31 +; GFX9-NEXT: s_add_i32 s2, s6, s1 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s3, s3, s0 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s3, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = srem i32 %x, %shl.y @@ -6805,7 +6798,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6828,22 +6821,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_lshr_b32 s5, s5, 20 -; GFX9-NEXT: s_add_i32 s4, s2, s4 -; GFX9-NEXT: s_add_i32 s5, s3, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s4, s5, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_lshr_b32 s1, s1, 20 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_add_i32 s1, s7, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s1, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6934,44 +6927,44 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s0, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s2, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s7 -; GFX6-NEXT: s_sub_i32 s7, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s7, s1 -; GFX6-NEXT: s_sub_i32 s7, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s7, s7, s1 +; GFX6-NEXT: s_mul_i32 s7, s7, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s7 +; GFX6-NEXT: s_sub_i32 s7, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s7, s3 +; GFX6-NEXT: s_sub_i32 s7, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s7, s7, s3 ; GFX6-NEXT: s_abs_i32 s6, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: s_abs_i32 s8, s5 ; GFX6-NEXT: s_xor_b32 s7, s7, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_sub_i32 s4, s7, s4 ; GFX6-NEXT: s_ashr_i32 s5, s5, 31 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -6989,20 +6982,20 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s5, s6, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s0, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_lshl_b32 s1, 0x1000, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s0 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s4, s4 @@ -7013,43 +7006,41 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_i32 s7, s7, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s7, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s7, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_sub_i32 s7, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_abs_i32 s7, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_sub_i32 s7, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s7, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_abs_i32 s3, s5 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_ashr_i32 s4, s5, 31 +; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s5, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_mul_i32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -7078,7 +7069,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 ; GFX6-NEXT: s_mul_i32 s6, s5, 0x68958c89 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -7163,11 +7154,11 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0f6 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7276,7 +7267,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: udiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7290,7 +7281,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 @@ -7312,8 +7303,8 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: udiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7328,12 +7319,12 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 12 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX9-NEXT: s_add_i32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -7357,8 +7348,8 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: udiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7373,17 +7364,17 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7403,8 +7394,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7490,13 +7481,13 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -7579,9 +7570,9 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7604,27 +7595,27 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s8, 12 -; GFX6-NEXT: s_add_i32 s2, s10, 12 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: s_add_i32 s8, s8, 12 +; GFX6-NEXT: s_add_i32 s9, s10, 12 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s8, 12 ; GFX9-NEXT: s_add_i32 s8, s10, 12 @@ -7650,12 +7641,12 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_oddk_denom: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_add_u32 s0, 4, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_addc_u32 s1, 0, 0 ; GFX6-NEXT: s_or_b32 s0, vcc_lo, vcc_hi -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_mov_b32 s0, 0x689e0837 ; GFX6-NEXT: s_movk_i32 s2, 0xfee0 @@ -7746,11 +7737,11 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7857,7 +7848,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 @@ -7871,7 +7862,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff @@ -7892,8 +7883,8 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: urem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7910,11 +7901,11 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s2 ; GFX9-NEXT: s_add_u32 s0, s0, -1 ; GFX9-NEXT: s_addc_u32 s1, s1, -1 ; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] @@ -7941,8 +7932,8 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: urem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -7957,16 +7948,16 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0xfff -; GFX9-NEXT: s_and_b32 s3, s6, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: s_and_b32 s1, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = urem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7989,31 +7980,31 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s10 -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 -; GFX6-NEXT: s_add_u32 s2, s2, -1 -; GFX6-NEXT: s_addc_u32 s3, s3, -1 -; GFX6-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX6-NEXT: s_add_u32 s0, s0, -1 -; GFX6-NEXT: s_addc_u32 s1, s1, -1 -; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 +; GFX6-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 +; GFX6-NEXT: s_add_u32 s8, s8, -1 +; GFX6-NEXT: s_addc_u32 s9, s9, -1 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_add_u32 s8, s10, -1 +; GFX6-NEXT: s_addc_u32 s9, s11, -1 +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10 ; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 @@ -8043,7 +8034,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s1, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -8159,7 +8150,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -8243,7 +8234,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8261,7 +8252,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -8287,21 +8278,21 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX6-NEXT: s_ashr_i32 s8, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s8 -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX6-NEXT: s_sub_u32 s4, 0, s10 ; GFX6-NEXT: s_subb_u32 s5, 0, s11 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8423,19 +8414,19 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_u32 s0, 0, s10 -; GFX9-NEXT: s_subb_u32 s1, 0, s11 +; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s2 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s5, s5, s2 +; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8445,60 +8436,61 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: s_mul_i32 s12, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s3 -; GFX9-NEXT: s_mul_i32 s13, s1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 +; GFX9-NEXT: s_mul_i32 s12, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 +; GFX9-NEXT: s_mul_i32 s13, s1, s11 ; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s0, s3 +; GFX9-NEXT: s_mul_i32 s15, s0, s11 ; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s3, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s12 -; GFX9-NEXT: s_mul_i32 s3, s3, s12 -; GFX9-NEXT: s_add_u32 s3, s14, s3 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 +; GFX9-NEXT: s_mul_i32 s11, s11, s12 +; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s2, s15 -; GFX9-NEXT: s_mul_i32 s15, s2, s15 -; GFX9-NEXT: s_add_u32 s3, s3, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s2, s12 -; GFX9-NEXT: s_addc_u32 s3, s13, s16 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_add_u32 s11, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 +; GFX9-NEXT: s_addc_u32 s11, s13, s16 ; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s2, s12 -; GFX9-NEXT: s_add_u32 s3, s3, s12 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s11, s11, s12 ; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s11, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s12 ; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: s_mul_i32 s3, s0, s2 +; GFX9-NEXT: s_mul_i32 s11, s0, s10 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s3, s13, s3 +; GFX9-NEXT: s_add_i32 s11, s13, s11 ; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_add_i32 s11, s11, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s2, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 +; GFX9-NEXT: s_mul_i32 s14, s10, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s11 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_addc_u32 s12, 0, s15 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 ; GFX9-NEXT: s_addc_u32 s0, s12, s13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s3 +; GFX9-NEXT: s_mul_i32 s11, s10, s11 +; GFX9-NEXT: s_add_u32 s0, s0, s11 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s2, s1 -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX9-NEXT: s_addc_u32 s12, s10, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s10 +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX9-NEXT: v_readfirstlane_b32 s13, v1 ; GFX9-NEXT: s_mul_i32 s1, s6, s12 ; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 @@ -8514,24 +8506,24 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s12, s7, s12 ; GFX9-NEXT: s_add_u32 s12, s0, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s12 +; GFX9-NEXT: s_mul_i32 s0, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s11, s12 +; GFX9-NEXT: s_mul_i32 s1, s9, s12 ; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s10, s12 +; GFX9-NEXT: s_mul_i32 s1, s8, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s14 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s0, s11 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s10, v1 +; GFX9-NEXT: s_subb_u32 s6, s0, s9 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s6, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s6, s11 +; GFX9-NEXT: s_cmp_ge_u32 s6, s9 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX9-NEXT: s_cmp_eq_u32 s6, s11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 +; GFX9-NEXT: s_cmp_eq_u32 s6, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -8549,10 +8541,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s11 +; GFX9-NEXT: s_cmp_ge_u32 s0, s9 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -8562,7 +8554,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 @@ -8589,8 +8581,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: sdiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8613,25 +8605,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 ; GFX9-NEXT: s_addc_u32 s5, s7, 0 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8651,8 +8643,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8752,17 +8744,17 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -8846,11 +8838,11 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_u32 s5, s6, s4 ; GFX9-NEXT: s_subb_u32 s4, s7, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8873,36 +8865,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 -; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s16, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s16 -; GFX6-NEXT: s_mov_b32 s17, s16 -; GFX6-NEXT: s_addc_u32 s1, s1, s16 -; GFX6-NEXT: s_xor_b64 s[12:13], s[0:1], s[16:17] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_u32 s0, 0, s12 -; GFX6-NEXT: s_subb_u32 s1, 0, s13 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], 0x1000, s10 +; GFX6-NEXT: s_ashr_i32 s14, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s14 +; GFX6-NEXT: s_mov_b32 s15, s14 +; GFX6-NEXT: s_addc_u32 s3, s3, s14 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[14:15] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_u32 s10, 0, s2 +; GFX6-NEXT: s_subb_u32 s11, 0, s3 +; GFX6-NEXT: s_ashr_i32 s16, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GFX6-NEXT: s_add_u32 s0, s4, s16 +; GFX6-NEXT: s_mov_b32 s17, s16 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX6-NEXT: s_addc_u32 s1, s5, s16 +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -8921,11 +8914,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -8941,11 +8935,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s5, s2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -8955,28 +8946,29 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -8985,23 +8977,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[16:17] -; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: s_add_u32 s4, s14, s2 +; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] +; GFX6-NEXT: s_ashr_i32 s4, s13, 31 +; GFX6-NEXT: s_add_u32 s12, s12, s4 ; GFX6-NEXT: v_mov_b32_e32 v6, s5 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s5, s15, s2 -; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX6-NEXT: s_mov_b32 s5, s4 +; GFX6-NEXT: s_addc_u32 s13, s13, s4 +; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -9010,16 +9002,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s12, 0, s4 +; GFX6-NEXT: s_sub_u32 s2, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 -; GFX6-NEXT: s_subb_u32 s13, 0, s5 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX6-NEXT: s_subb_u32 s3, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -9038,11 +9030,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9057,14 +9049,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s12 +; GFX6-NEXT: s_add_u32 s6, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s7, s7, s12 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s7, s7, s2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9080,25 +9072,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s5, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-NEXT: v_mov_b32_e32 v7, s13 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s4, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -9109,15 +9101,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] +; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -9130,19 +9122,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s3, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 @@ -9416,6 +9408,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y @@ -9432,6 +9425,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_oddk_denom: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s0, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -9451,7 +9445,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, s1, v2 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v3, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc @@ -9546,7 +9539,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -9633,7 +9626,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9653,7 +9646,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -9681,21 +9674,21 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s1, s1, s4 -; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] +; GFX6-NEXT: s_addc_u32 s3, s3, s4 +; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_sub_u32 s4, 0, s8 ; GFX6-NEXT: s_subb_u32 s5, 0, s9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9815,17 +9808,17 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s4 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s8 ; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 @@ -9979,8 +9972,8 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: srem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10007,17 +10000,17 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s2, s4, s2 -; GFX9-NEXT: s_subb_u32 s3, s5, s3 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s0, s4, s0 +; GFX9-NEXT: s_subb_u32 s1, s5, s1 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 @@ -10025,11 +10018,11 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-NEXT: s_sub_u32 s4, s6, s4 ; GFX9-NEXT: s_subb_u32 s5, s7, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = srem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -10052,36 +10045,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s8, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s8 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s8 -; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[8:9] +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX6-NEXT: s_sub_u32 s0, 0, s14 -; GFX6-NEXT: s_subb_u32 s1, 0, s15 +; GFX6-NEXT: s_sub_u32 s2, 0, s14 +; GFX6-NEXT: s_subb_u32 s3, 0, s15 ; GFX6-NEXT: s_ashr_i32 s12, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX6-NEXT: s_addc_u32 s1, s5, s12 +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -10100,11 +10096,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -10120,11 +10116,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s5, s12 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -10305,19 +10298,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s3, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 417d38990505b..2c69ae58f0e61 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -33,7 +33,7 @@ entry: define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -42,7 +42,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -58,8 +58,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 @@ -72,8 +72,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 @@ -92,7 +92,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %a ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -114,7 +114,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 @@ -124,7 +124,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 @@ -140,11 +140,11 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -155,8 +155,8 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr ; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN @@ -175,7 +175,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -185,7 +185,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -201,7 +201,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -211,7 +211,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -227,7 +227,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { ; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 @@ -238,7 +238,7 @@ define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ; ; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index f54ea615ca664..af4116bd6aae5 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -115,7 +115,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -221,7 +221,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -313,7 +313,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -329,7 +329,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -373,7 +373,7 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -417,8 +417,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ; ; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -444,8 +444,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -462,8 +462,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou ; ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -491,8 +491,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -509,8 +509,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ; ; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index 2e64db12ef564..f8bd44b7c98f5 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 { ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfm_b32 s2, s2, s3 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; ; VI-LABEL: s_bfm_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfm_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -36,11 +36,11 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #0 { ; SI-LABEL: s_bfm_pattern_simple: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s4, s4, 0 +; SI-NEXT: s_bfm_b32 s4, s2, 0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -48,10 +48,10 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) # ; ; VI-LABEL: s_bfm_pattern_simple: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfm_b32 s2, s4, 0 +; VI-NEXT: s_bfm_b32 s2, s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 6f52da2631b8a..64555f14a55cc 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -21,8 +21,8 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 { ; SI-LABEL: s_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; GISEL-LABEL: s_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -62,10 +62,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-FLAT-LABEL: s_brev_i16: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] @@ -76,11 +76,11 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-LABEL: s_brev_i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16 @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_mov_b32 s10, s6 @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i16: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0 @@ -168,7 +168,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] @@ -187,8 +187,8 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 { ; SI-LABEL: s_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -199,8 +199,8 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -211,10 +211,10 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; GISEL-LABEL: s_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b32 s2, s4 +; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -224,11 +224,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-FLAT-LABEL: s_brev_i32: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 @@ -240,11 +240,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-GISEL-LABEL: s_brev_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b32 s2, s4 +; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -259,7 +259,7 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -278,7 +278,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -294,7 +294,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -311,9 +311,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[2:3] @@ -328,10 +326,8 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -351,7 +347,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -366,7 +362,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -381,7 +377,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 @@ -394,7 +390,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -411,7 +407,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-GISEL-LABEL: s_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 @@ -430,7 +426,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -450,7 +446,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -467,7 +463,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -485,9 +481,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -503,11 +497,9 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -528,7 +520,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -542,7 +534,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -556,7 +548,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -568,7 +560,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-FLAT-LABEL: s_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3] ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 @@ -581,7 +573,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-GISEL-LABEL: s_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] @@ -599,7 +591,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -619,7 +611,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -636,7 +628,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -654,9 +646,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -672,9 +662,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -697,8 +685,8 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 { ; SI-LABEL: s_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -713,8 +701,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -729,8 +717,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; GISEL-LABEL: s_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] ; GISEL-NEXT: s_brev_b64 s[2:3], s[6:7] @@ -746,8 +734,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-FLAT-LABEL: s_brev_v2i64: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5] ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7] @@ -763,8 +751,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-GISEL-LABEL: s_brev_v2i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] @@ -783,7 +771,7 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -805,7 +793,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -824,7 +812,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -844,9 +832,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[2:3] @@ -864,9 +850,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 8293280609517..e914635d6c26f 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector2: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -19,7 +19,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector2: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -40,7 +40,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector2: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -52,7 +52,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector2: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -67,7 +67,7 @@ entry: define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector4: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -80,7 +80,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: v_mov_b32_e32 v2, 7 @@ -93,7 +93,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector4: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector4: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector4: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -136,7 +136,7 @@ entry: define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector_v2i16: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005 @@ -146,7 +146,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector_v2i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -156,7 +156,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -165,7 +165,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector_v2i16: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -190,8 +190,8 @@ entry: define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: build_vector_v2i16_trunc: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -201,10 +201,10 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX8-LABEL: build_vector_v2i16_trunc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s2, s4, 16 +; GFX8-NEXT: s_lshr_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s2, 0x50000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -215,11 +215,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX10-LABEL: build_vector_v2i16_trunc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -228,11 +228,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX11-LABEL: build_vector_v2i16_trunc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_hl_b32_b16 s2, s4, 5 +; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,14 +242,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX940-LABEL: build_vector_v2i16_trunc: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshr_b32 s2, s4, 16 -; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, 5 -; GFX940-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 5 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm %srl = lshr i32 %a, 16 %trunc = trunc i32 %srl to i16 @@ -262,7 +262,7 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) { ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -277,7 +277,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 @@ -302,7 +302,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 @@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s3, s3, 16 -; GFX940-NEXT: s_lshl_b32 s2, s2, 16 -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_lshl_b32 s0, s7, 16 +; GFX940-NEXT: s_lshl_b32 s1, s6, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm entry: %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 231d3d97c8f4f..15ebdd70ae881 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -8,7 +8,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; SI-LABEL: kernel: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -18,7 +18,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; VI-LABEL: kernel: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -28,7 +28,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; GFX11-LABEL: kernel: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -115,32 +115,21 @@ define amdgpu_kernel void @call_coldcc() #0 { ; SI-LABEL: call_coldcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s23, 0xe8f000 -; SI-NEXT: s_add_u32 s20, s20, s9 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_mov_b32 s14, s8 -; SI-NEXT: s_mov_b64 s[10:11], s[4:5] -; SI-NEXT: s_add_u32 s8, s2, 36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; SI-NEXT: s_addc_u32 s9, s3, 0 -; SI-NEXT: s_getpc_b64 s[2:3] -; SI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v31, v0, v2 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b64 s[0:1], s[20:21] -; SI-NEXT: s_mov_b64 s[2:3], s[22:23] +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,49 +141,31 @@ define amdgpu_kernel void @call_coldcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s9 +; VI-NEXT: s_add_u32 s88, s88, s1 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_mov_b32 s14, s8 -; VI-NEXT: s_add_u32 s8, s2, 36 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: s_getpc_b64 s[2:3] -; VI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b64 s[10:11], s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_mov_b64 s[4:5], s[0:1] +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_coldcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s8, s2, 36 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) @@ -206,32 +177,21 @@ define amdgpu_kernel void @call_fastcc() #0 { ; SI-LABEL: call_fastcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s23, 0xe8f000 -; SI-NEXT: s_add_u32 s20, s20, s9 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_mov_b32 s14, s8 -; SI-NEXT: s_mov_b64 s[10:11], s[4:5] -; SI-NEXT: s_add_u32 s8, s2, 36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; SI-NEXT: s_addc_u32 s9, s3, 0 -; SI-NEXT: s_getpc_b64 s[2:3] -; SI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v31, v0, v2 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b64 s[0:1], s[20:21] -; SI-NEXT: s_mov_b64 s[2:3], s[22:23] +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -243,49 +203,31 @@ define amdgpu_kernel void @call_fastcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s9 +; VI-NEXT: s_add_u32 s88, s88, s1 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_mov_b32 s14, s8 -; VI-NEXT: s_add_u32 s8, s2, 36 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: s_getpc_b64 s[2:3] -; VI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b64 s[10:11], s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_mov_b64 s[4:5], s[0:1] +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_fastcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s8, s2, 36 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) @@ -1012,7 +954,7 @@ define amdgpu_ps i16 @ret_ps_mesa_i16() { define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; SI-LABEL: amd_kernel_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s0, s0, s0 @@ -1023,7 +965,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; VI-LABEL: amd_kernel_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1032,7 +974,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; GFX11-LABEL: amd_kernel_i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1050,7 +992,7 @@ entry: define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; SI-LABEL: amd_kernel_v2i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,7 +1010,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v2i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_i32 s0, s0, s0 @@ -1082,7 +1024,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v2i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 @@ -1107,7 +1049,7 @@ entry: define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; SI-LABEL: amd_kernel_v4i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1135,7 +1077,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v4i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 24 ; VI-NEXT: s_lshr_b32 s2, s0, 16 @@ -1157,7 +1099,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v4i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 @@ -1194,7 +1136,7 @@ entry: define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; SI-LABEL: amd_kernel_v3i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 2 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1218,7 +1160,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v3i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1238,7 +1180,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v3i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1270,7 +1212,7 @@ entry: define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; SI-LABEL: amd_kernel_v5i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 4 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1303,7 +1245,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v5i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_lshr_b32 s3, s0, 16 @@ -1331,7 +1273,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v5i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 @@ -1370,7 +1312,7 @@ entry: define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; SI-LABEL: amd_kernel_v8i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1415,7 +1357,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v8i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s1, 24 ; VI-NEXT: s_lshr_b32 s3, s1, 16 @@ -1450,7 +1392,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v8i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1 @@ -1503,7 +1445,7 @@ entry: define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; SI-LABEL: amd_kernel_v16i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1582,7 +1524,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v16i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 24 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -1643,7 +1585,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v16i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s10, s3, 16 ; GFX11-NEXT: s_lshr_b32 s11, s3, 24 @@ -1724,7 +1666,7 @@ entry: define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; SI-LABEL: amd_kernel_v32i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s9, 0 ; SI-NEXT: s_mov_b32 s8, 16 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1874,7 +1816,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v32i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v10, 0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1990,7 +1932,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v32i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2 ; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index fad1d47f55fd7..d511bb1f4a257 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -30,7 +30,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 +; GFX10-NEXT: s_add_u32 s0, s4, 8 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: s_add_u32 s2, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_add_u32 s0, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_clause 0x3 @@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX10-NEXT: flat_load_dword v9, v[2:3] ; GFX10-NEXT: flat_load_dword v10, v[4:5] ; GFX10-NEXT: flat_load_dword v11, v[6:7] -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_add_u32 s0, s6, 8 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 16 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_add_u32 s2, s2, 24 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_add_u32 s0, s6, 16 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: s_add_u32 s2, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: s_addc_u32 s3, s7, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 @@ -96,7 +96,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 @@ -155,7 +155,7 @@ bb: define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 +; GFX10-NEXT: s_add_u32 s0, s4, 8 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: s_add_u32 s2, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: s_add_u32 s0, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: flat_load_dword v6, v[2:3] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 @@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: flat_load_dword v8, v[0:1] ; GFX10-NEXT: flat_load_dword v9, v[4:5] ; GFX10-NEXT: flat_load_dword v10, v[2:3] -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: s_add_u32 s4, s2, 16 +; GFX10-NEXT: s_add_u32 s0, s6, 8 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: s_add_u32 s2, s6, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s5, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_addc_u32 s3, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: s_add_u32 s0, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 @@ -223,7 +223,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index df223b3ec1354..9c7fa1537c0c2 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -88,7 +88,7 @@ bb: define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: sub1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: sub1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -127,33 +127,33 @@ bb: define amdgpu_kernel void @add_adde(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: add_adde: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v4, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: add_adde: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v3, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -171,33 +171,33 @@ bb: define amdgpu_kernel void @adde_add(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: adde_add: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: adde_add: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -215,33 +215,33 @@ bb: define amdgpu_kernel void @sub_sube(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -259,35 +259,35 @@ bb: define amdgpu_kernel void @sub_sube_commuted(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube_commuted: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube_commuted: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x64, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -306,33 +306,33 @@ bb: define amdgpu_kernel void @sube_sub(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sube_sub: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sube_sub: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -350,33 +350,33 @@ bb: define amdgpu_kernel void @zext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: zext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: zext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -392,33 +392,33 @@ bb: define amdgpu_kernel void @sext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: sext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -434,7 +434,7 @@ bb: define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add_and: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_max_u32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 @@ -478,7 +478,7 @@ bb: define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_sext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -493,7 +493,7 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_sext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -518,7 +518,7 @@ bb: define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_zext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_zext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,33 +557,33 @@ bb: define amdgpu_kernel void @sub_addcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_addcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_addcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -601,33 +601,33 @@ bb: define amdgpu_kernel void @sub_subcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_subcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_subcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -646,7 +646,7 @@ bb: define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -695,7 +695,7 @@ bb: define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 63b9d68123fa4..3145ee1f6141e 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -23,11 +23,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s2, s4 +; SI-NEXT: s_flbit_i32_b32 s2, s2 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -36,8 +36,8 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -62,36 +62,36 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_ctlz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s2, s4 -; GFX10-NEXT: s_min_u32 s2, s2, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_flbit_i32_b32 s0, s4 +; GFX10-NEXT: s_min_u32 s0, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s4 +; GFX11-NEXT: s_clz_i32_u32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, 32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -107,7 +107,7 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -127,7 +127,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -164,40 +164,39 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -214,7 +213,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -236,7 +235,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -278,41 +277,39 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -336,7 +333,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -362,7 +359,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -414,11 +411,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v3, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 @@ -428,16 +425,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -447,16 +444,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -485,7 +480,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -505,7 +500,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -555,33 +550,33 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] @@ -603,8 +598,8 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -617,8 +612,8 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -650,11 +645,11 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_ctlz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s0, s[0:1] +; GFX10-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -663,12 +658,12 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_ctlz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -678,14 +673,14 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX11-LABEL: s_ctlz_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u64 s0, s[0:1] +; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_u32 s0, s0, 64 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: global_store_b64 v1, v[0:1], s[2:3] +; GFX11-NEXT: s_min_u32 s2, s2, 64 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -697,7 +692,7 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -711,7 +706,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -742,29 +737,29 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX10-NEXT: s_min_u32 s2, s2, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX10-NEXT: s_min_u32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -783,7 +778,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -804,7 +799,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -852,25 +847,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -878,14 +873,12 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -912,7 +905,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -933,7 +926,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -981,51 +974,49 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp -; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp +; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1042,7 +1033,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1061,7 +1052,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1099,37 +1090,35 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1151,7 +1140,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1170,7 +1159,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1208,37 +1197,35 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1261,7 +1248,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1283,7 +1270,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1326,46 +1313,44 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1385,7 +1370,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1407,7 +1392,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1450,46 +1435,44 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1509,7 +1492,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1527,7 +1510,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1569,22 +1552,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1595,13 +1578,13 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1623,7 +1606,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1641,7 +1624,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1691,25 +1674,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 @@ -1717,12 +1700,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1750,7 +1733,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1769,7 +1752,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1812,23 +1795,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1841,19 +1824,18 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index f16f05811c185..a377714ebf737 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -29,11 +29,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -41,10 +41,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s4 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -64,13 +64,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, ptr addrspace(1) %out, align 4 @@ -80,7 +80,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -154,7 +154,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -318,11 +318,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 24 +; SI-NEXT: s_lshl_b32 s2, s2, 24 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 24 +; VI-NEXT: s_lshl_b32 s2, s2, 24 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -373,14 +373,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 24 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 24 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i8 %val, 0 @@ -392,11 +392,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_lshl_b32 s2, s2, 16 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -405,11 +405,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_add_i32 s2, s2, -16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -447,14 +448,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 16 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 16 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i16 %val, 0 @@ -466,11 +467,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -478,10 +479,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s4 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -501,13 +502,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i32 %val, 0 @@ -519,7 +520,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -533,7 +534,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -561,14 +562,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i64 %val, 0 @@ -580,7 +581,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -601,7 +602,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -649,17 +650,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone @@ -672,7 +673,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -697,7 +698,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -711,8 +712,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_ffbh_u32_e32 v1, v1 +; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -753,11 +754,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -765,7 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone @@ -778,7 +779,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -809,7 +810,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -869,13 +870,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -886,7 +887,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone @@ -899,7 +900,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -946,7 +947,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1050,17 +1051,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1081,7 +1082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -1094,7 +1095,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1113,7 +1114,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1158,11 +1159,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1170,7 +1171,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1183,8 +1184,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,14 +1197,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1225,14 +1226,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) @@ -1243,7 +1244,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1256,7 +1257,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1282,12 +1283,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 @@ -1298,7 +1299,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1318,7 +1319,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,17 +1365,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1388,7 +1389,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -1408,7 +1409,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1454,17 +1455,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1479,7 +1480,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1498,7 +1499,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,16 +1535,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1558,7 +1559,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1577,7 +1578,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,16 +1614,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1637,7 +1638,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1655,7 +1656,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1697,11 +1698,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1709,9 +1710,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3] -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1726,7 +1727,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1750,7 +1751,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1799,17 +1800,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1829,7 +1830,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1850,7 +1851,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1888,16 +1889,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1913,7 +1914,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1934,7 +1935,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1972,16 +1973,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1997,7 +1998,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2018,7 +2019,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2057,16 +2058,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2082,7 +2083,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2103,7 +2104,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2142,16 +2143,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2167,15 +2168,18 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { ; SI-LABEL: v_ctlz_zero_undef_i7: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, 25, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_i7: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 -; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: v_add_u16_e32 v0, -9, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_i7: @@ -2196,12 +2200,13 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 14 -; SI-NEXT: s_flbit_i32_b32 s4, s2 +; SI-NEXT: s_and_b32 s2, s2, 0x3ffff +; SI-NEXT: s_flbit_i32_b32 s2, s2 +; SI-NEXT: s_add_i32 s4, s2, -14 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_bfe_u32 s4, s4, 0x20010 @@ -2213,17 +2218,18 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 14 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s2, s2, 0x3ffff ; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_i32 s2, s2, -14 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s0, 2 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s0, 2 ; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_bfe_u32 s2, s2, 0x20010 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2233,18 +2239,20 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; EG-LABEL: s_ctlz_zero_undef_i18: ; EG: ; %bb.0: -; EG-NEXT: ALU 28, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 30, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X ; EG-NEXT: MEM_RAT MSKOR T0.XW, T2.X ; EG-NEXT: CF_END ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x, -; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 262143(3.673406e-40), 0(0.000000e+00) ; EG-NEXT: FFBH_UINT T0.W, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, +; EG-NEXT: -14(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, PS, literal.y, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) ; EG-NEXT: LSHL T1.X, PV.W, PS, ; EG-NEXT: LSHL * T1.W, literal.x, PS, @@ -2270,18 +2278,18 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 14 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff -; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] offset:2 +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 14 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff +; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone store i18 %ctlz, ptr addrspace(1) %out, align 4 @@ -2292,15 +2300,17 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) { ; SI-LABEL: v_ctlz_zero_undef_i18: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, -14, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_i18: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -14, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_i18: @@ -2322,19 +2332,23 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { ; SI-LABEL: v_ctlz_zero_undef_v2i18: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 14, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x3ffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, -14, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, -14, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v2i18: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 14, v1 +; VI-NEXT: v_and_b32_e32 v1, 0x3ffff, v1 +; VI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_ffbh_u32_e32 v1, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, -14, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -14, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v2i18: @@ -2369,11 +2383,11 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ; VI-LABEL: v_ctlz_zero_undef_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v2i16: @@ -2415,13 +2429,13 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ; VI-LABEL: v_ctlz_zero_undef_v3i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_ffbh_u32_e32 v2, v2 -; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_ffbh_u32_e32 v1, v1 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v3i16: @@ -2469,16 +2483,16 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ; VI-LABEL: v_ctlz_zero_undef_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_ffbh_u32_e32 v2, v2 -; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_ffbh_u32_e32 v3, v3 -; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_ffbh_u32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0xfff00000, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v4i16: @@ -2553,19 +2567,28 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) { ; SI-LABEL: v_ctlz_zero_undef_v2i7: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 25, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, 25, v0 +; SI-NEXT: v_subrev_i32_e32 v1, vcc, 25, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v2i7: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 25, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v2, 0x7f007f, v0 +; VI-NEXT: v_bfe_u32 v0, v0, 16, 7 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_ffbh_u32_e32 v1, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: v_add_u16_e32 v1, -9, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x7f, v2 +; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: v_add_u16_e32 v0, -9, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v2i7: diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 40929d5883447..b6359f1816979 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -14,8 +14,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) ; ; VI-LABEL: s_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -142,8 +142,8 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: v_ctpop_add_chain_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -166,8 +166,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctpop_add_chain_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -239,8 +239,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind { ; SI-LABEL: v_ctpop_add_sgpr_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -259,8 +259,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctpop_add_sgpr_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -344,7 +344,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -430,7 +430,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -562,7 +562,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v16i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_literal: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_movk_i32 s4, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1254,8 +1254,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt ; ; VI-LABEL: v_ctpop_i16_add_var: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1315,8 +1315,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_var_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind { ; SI-LABEL: v_ctpop_i16_add_vvar_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v_ctpop_i16_add_vvar_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1487,8 +1487,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) { ; SI-LABEL: ctpop_i16_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: s_cmp_lg_u32 s5, 0 @@ -1517,8 +1517,8 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: s_cmp_lg_u32 s5, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 1c16612bed37f..131ce14a7847c 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -16,8 +16,8 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -92,8 +92,8 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind { ; SI-LABEL: v_ctpop_i64_user: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -115,8 +115,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctpop_i64_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -144,8 +144,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 ; ; VI-LABEL: s_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -178,38 +178,38 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] -; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; SI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; SI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] -; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -220,7 +220,7 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64 define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -334,11 +334,11 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) { ; SI-LABEL: ctpop_i64_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; SI-NEXT: s_load_dword s8, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 @@ -363,11 +363,11 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i64_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dword s8, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s8, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 @@ -409,8 +409,8 @@ endif: define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind { ; SI-LABEL: s_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val ; ; VI-LABEL: s_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind { ; SI-LABEL: s_ctpop_i65: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,8 +460,8 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) ; ; VI-LABEL: s_ctpop_i65: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -484,7 +484,7 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index 02b0b1cc28fa8..ec532c8e4adc3 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -22,11 +22,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s2, s4 +; SI-NEXT: s_ff1_i32_b32 s2, s2 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -61,27 +61,27 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_cttz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s2, s4 -; GFX10-NEXT: s_min_u32 s2, s2, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_ff1_i32_b32 s0, s4 +; GFX10-NEXT: s_min_u32 s0, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -91,7 +91,7 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -204,7 +204,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -310,7 +310,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 @@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %valptr %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -505,8 +505,8 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +519,8 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -552,11 +552,11 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_cttz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s0, s[0:1] +; GFX10-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -565,12 +565,12 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_cttz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -584,7 +584,7 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -598,7 +598,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] -; GFX10-NEXT: s_min_u32 s2, s2, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX10-NEXT: s_min_u32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) %trunc = trunc i64 %cttz to i32 @@ -657,7 +657,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 @@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -787,7 +787,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -876,7 +876,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -895,7 +895,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -970,7 +970,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -989,7 +989,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1065,7 +1065,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1275,7 +1275,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s2 -; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s0 +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1375,7 +1375,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1393,7 +1393,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1499,7 +1499,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 2491abe4bc1ce..086d99916ba04 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -16,11 +16,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -28,10 +28,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -51,13 +51,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -141,7 +141,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -219,7 +219,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -305,11 +305,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -317,10 +317,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -356,13 +356,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i8 %val, 0 @@ -374,11 +374,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -386,10 +386,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -425,13 +425,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i16 %val, 0 @@ -443,11 +443,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -455,10 +455,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -478,13 +478,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i32 %val, 0 @@ -496,7 +496,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i64 %val, 0 @@ -557,7 +557,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -577,7 +577,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -668,7 +668,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone @@ -745,7 +745,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -776,7 +776,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone @@ -866,7 +866,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -913,7 +913,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1091,7 +1091,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1435,7 +1435,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1453,7 +1453,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1544,7 +1544,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 @@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 96969a12b2c58..4226728dbe118 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -900,7 +900,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -933,30 +933,30 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_i8_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_i8_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -976,7 +976,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -996,7 +996,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1013,37 +1013,35 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v2i8_to_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v2i8_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1064,7 +1062,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1086,7 +1084,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1104,38 +1102,36 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v3i8_to_v3f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3i8_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1157,7 +1153,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1179,7 +1175,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1198,41 +1194,39 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1259,7 +1253,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1287,7 +1281,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1318,15 +1312,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -1335,19 +1329,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1356,15 +1350,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -1396,7 +1388,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -1434,7 +1426,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s8, 0x4000405 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1481,15 +1473,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 -; GFX10-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v1, v0, s[8:9] offset:2 +; GFX10-NEXT: global_load_ubyte v3, v0, s[8:9] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[10:11] offset:3 +; GFX10-NEXT: global_load_ubyte v4, v0, s[10:11] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 @@ -1499,21 +1491,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405 -; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] -; GFX10-NEXT: global_store_dword v7, v4, s[2:3] +; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dword v7, v4, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_mov_b32 s0, 0x4000405 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 -; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 -; GFX9-NEXT: s_mov_b32 s4, 0x4000405 +; GFX9-NEXT: global_load_ubyte v1, v0, s[8:9] offset:2 +; GFX9-NEXT: global_load_ubyte v2, v0, s[10:11] offset:3 +; GFX9-NEXT: global_load_ubyte v3, v0, s[8:9] offset:3 +; GFX9-NEXT: global_load_ubyte v4, v0, s[10:11] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 @@ -1522,18 +1514,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_perm_b32 v4, v6, v7, s4 -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dword v5, v4, s[2:3] +; GFX9-NEXT: v_perm_b32 v4, v6, v7, s0 +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dword v5, v4, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 @@ -1573,21 +1563,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 @@ -1596,7 +1586,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 @@ -1609,22 +1599,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,12 +1643,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1676,22 +1666,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX10-NEXT: global_store_dword v4, v5, s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dword v4, v5, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s5, 0x900 +; GFX9-NEXT: s_movk_i32 s0, 0xff00 +; GFX9-NEXT: s_movk_i32 s1, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -1699,28 +1688,25 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; GFX9-NEXT: v_add_u16_e32 v8, 9, v4 -; GFX9-NEXT: v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v5, v0, s[2:3] +; GFX9-NEXT: global_store_dword v5, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 @@ -1767,7 +1753,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1806,7 +1792,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1851,17 +1837,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_short_d16 v7, v0, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(4) @@ -1875,22 +1861,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[4:5] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v7i8_to_v7f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v7, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v8, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v9, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) @@ -1904,17 +1890,15 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 -; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[4:5] offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 @@ -1953,7 +1937,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1980,7 +1964,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2004,11 +1988,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 @@ -2018,17 +2002,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[4:5] offset:16 +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v8i8_to_v8f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -2038,17 +2022,15 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[4:5] offset:16 +; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v8i8_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v10, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2077,7 +2059,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2097,7 +2079,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2114,40 +2096,39 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2166,7 +2147,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2185,7 +2166,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2201,34 +2182,32 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2252,7 +2231,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -2270,7 +2249,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2285,30 +2264,30 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2329,7 +2308,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2357,7 +2336,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2388,15 +2367,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -2405,19 +2384,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -2426,15 +2405,13 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -2465,7 +2442,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2484,7 +2461,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2500,34 +2477,32 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte0_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte0_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2548,7 +2523,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2567,7 +2542,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2583,34 +2558,32 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2632,7 +2605,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2651,7 +2624,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2667,34 +2640,32 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte2_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte2_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2716,7 +2687,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2735,7 +2706,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2751,34 +2722,32 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte3_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte3_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2800,7 +2769,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -2820,7 +2789,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -2840,7 +2809,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX10-LABEL: cvt_ubyte0_or_multiuse: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2854,31 +2823,29 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: cvt_ubyte0_or_multiuse: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index f298a95c63485..739fff5084135 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -7,11 +7,11 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_0_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_lshl_b32 s4, s2, 16 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -19,33 +19,33 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_0_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_0_i16: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_lshl_b32 s2, s4, 16 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_lshl_b32 s0, s4, 16 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_0_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -92,11 +92,11 @@ define i32 @divergent_vec_0_i16(i16 %a) { define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_i16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s4, s2, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -104,33 +104,33 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -177,11 +177,11 @@ define i32 @divergent_vec_i16_0(i16 %a) { define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; GCN-LABEL: uniform_vec_f16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s4, s2, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -189,33 +189,33 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; ; GFX9-LABEL: uniform_vec_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_f16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -262,7 +262,7 @@ define float @divergent_vec_f16_0(half %a) { define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_i16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -277,7 +277,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_i16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -290,7 +290,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_i16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_i16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -361,7 +361,7 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) { define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_LH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 @@ -376,27 +376,27 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[4:5] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_LH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -452,7 +452,7 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_HH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -466,27 +466,27 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[4:5] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_HH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -546,7 +546,7 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_f16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -561,7 +561,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_f16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_f16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_f16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -684,10 +684,10 @@ entry: define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 { ; GCN-LABEL: build_vec_v2i16_undeflo_uniform: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u16 v0, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -698,35 +698,35 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; ; GFX9-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read_u16_d16 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, s4 ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[0:1] +; GFX906-NEXT: global_store_dword v1, v0, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_load_u16_d16 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index b72cd7e1d1eca..777a8f3fef1c1 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -51,7 +51,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -64,7 +64,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -88,7 +88,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:1028 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -102,7 +102,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -126,7 +126,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -142,7 +142,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -184,7 +184,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: s_mov_b32 s2, 0 @@ -202,7 +202,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_barrier ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -245,7 +245,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -301,7 +301,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -319,7 +319,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -352,7 +352,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) % ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -406,7 +406,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -419,7 +419,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -449,7 +449,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -487,7 +487,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +501,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -522,11 +522,13 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:34 ; CI-NEXT: ds_read_u8 v3, v1 offset:32 ; CI-NEXT: ds_read_u8 v4, v1 offset:3 @@ -535,13 +537,15 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: ds_read_u8 v7, v1 ; CI-NEXT: ds_read_u8 v8, v1 offset:33 ; CI-NEXT: ds_read_u8 v1, v1 offset:35 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(5) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -550,7 +554,6 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -558,8 +561,8 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -582,17 +585,17 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -612,11 +615,13 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:11 ; CI-NEXT: ds_read_u8 v3, v1 offset:9 ; CI-NEXT: ds_read_u8 v4, v1 offset:8 @@ -625,13 +630,15 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: ds_read_u8 v7, v1 offset:5 ; CI-NEXT: ds_read_u8 v8, v1 offset:10 ; CI-NEXT: ds_read_u8 v1, v1 offset:12 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(5) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -640,7 +647,6 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -648,8 +654,8 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -672,17 +678,17 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -702,41 +708,44 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_2_simple_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u16 v2, v1 offset:32 ; CI-NEXT: ds_read_u16 v3, v1 offset:2 ; CI-NEXT: ds_read_u16 v4, v1 ; CI-NEXT: ds_read_u16 v1, v1 offset:34 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_or_b32_e32 v3, v3, v4 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_add_f32_e32 v2, v3, v1 -; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] @@ -744,12 +753,12 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -772,7 +781,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -785,7 +794,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -808,7 +817,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -821,7 +830,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -845,7 +854,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[1:2], v0 ; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,7 +868,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -880,15 +889,15 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_read2_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 ; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 ; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] @@ -898,13 +907,13 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: misaligned_read2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -929,7 +938,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -941,7 +950,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -959,7 +968,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -971,7 +980,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -991,7 +1000,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b128 v[0:3], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1004,7 +1013,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b128 v[0:3], v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1026,7 +1035,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 ; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1040,7 +1049,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1059,11 +1068,12 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 { ; CI-LABEL: sgemm_inner_loop_read2_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_lshl_b32 s4, s6, 2 -; CI-NEXT: s_add_i32 s5, s4, 0xc20 -; CI-NEXT: s_addk_i32 s4, 0xc60 -; CI-NEXT: v_mov_b32_e32 v0, s5 -; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; CI-NEXT: s_lshl_b32 s0, s2, 2 +; CI-NEXT: s_add_i32 s1, s0, 0xc20 +; CI-NEXT: s_addk_i32 s0, 0xc60 +; CI-NEXT: v_mov_b32_e32 v0, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -1071,29 +1081,24 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; CI-NEXT: s_waitcnt lgkmcnt(4) +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v1 -; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 -; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v5 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v6 ; CI-NEXT: v_add_f32_e32 v0, v0, v7 ; CI-NEXT: v_add_f32_e32 v0, v0, v8 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_add_f32_e32 v0, v0, v9 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: sgemm_inner_loop_read2_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_lshl_b32 s2, s6, 2 +; GFX9-NEXT: s_lshl_b32 s2, s2, 2 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 ; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1104,12 +1109,16 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 @@ -1163,28 +1172,28 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %load = load <2 x i32>, ptr addrspace(3) %in, align 4 store <2 x i32> %load, ptr addrspace(1) %out, align 8 @@ -1194,28 +1203,28 @@ define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %load = load i64, ptr addrspace(3) %in, align 4 store i64 %load, ptr addrspace(1) %out, align 8 @@ -1225,8 +1234,8 @@ define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-LABEL: ds_read_diff_base_interleaving: ; CI: ; %bb.0: ; %bb -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1256,10 +1265,10 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 @@ -1461,7 +1470,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; CI-NEXT: ds_read_u8 v6, v0 offset:66 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 @@ -1488,7 +1497,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 @@ -1505,7 +1514,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index f4ec16db55d68..70011e56d016e 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: extract_vector_elt_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -36,7 +36,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: extract_vector_elt_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -62,8 +62,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s0, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s1, s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -77,8 +77,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,8 +95,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 @@ -119,8 +119,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -139,15 +139,15 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dword v2, v[1:2] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -162,14 +162,12 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -195,7 +193,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,7 +208,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; VI-LABEL: extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -224,7 +222,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; GFX11-LABEL: extract_vector_elt_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -249,8 +247,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s4, s4, 4 @@ -264,8 +262,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: dynamic_extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -280,8 +278,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s4, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -302,7 +300,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_extractelement_v4f16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -319,7 +317,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_extractelement_v4f16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -338,9 +336,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_extractelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -363,7 +359,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -384,7 +380,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,21 +403,20 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -439,7 +434,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_01: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -456,7 +451,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_01: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -473,7 +468,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_01: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +495,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x1 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -517,7 +512,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -534,7 +529,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_23: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -561,8 +556,8 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 @@ -613,8 +608,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -656,46 +651,43 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v0, s[6:7] +; GFX11-NEXT: global_load_b128 v[1:4], v1, s[6:7] ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -712,8 +704,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 @@ -802,8 +794,8 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -877,81 +869,78 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[6:7] offset:16 +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7] offset:16 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX11-NEXT: s_cmp_eq_u32 s0, 9 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; GFX11-NEXT: s_cmp_eq_u32 s0, 11 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v7 ; GFX11-NEXT: s_cmp_eq_u32 s0, 13 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-NEXT: s_cmp_eq_u32 s0, 15 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 21799ab79b839..f34824cd6cefe 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: s_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,8 +23,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: s_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: s_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -66,8 +66,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: s_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: s_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: s_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -102,10 +102,10 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: s_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -120,8 +120,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; VI-LABEL: s_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; GFX9-LABEL: s_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -156,10 +156,10 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GFX11-LABEL: s_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -174,7 +174,7 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -187,7 +187,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -200,7 +200,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX9-LABEL: s_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX11-LABEL: s_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -231,12 +231,12 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) { ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +247,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; VI-LABEL: fabs_fold_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -260,8 +260,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; GFX9-LABEL: fabs_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -273,13 +273,13 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-LABEL: fabs_fold_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, |s4|, s2 +; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -293,7 +293,7 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -307,7 +307,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -332,9 +332,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -356,8 +354,8 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-LABEL: fabs_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -368,8 +366,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; VI-LABEL: fabs_free_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -380,8 +378,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fabs_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -392,10 +390,10 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fabs_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -413,7 +411,7 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_fold_self_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -439,7 +437,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: v_fabs_fold_self_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -457,7 +455,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_fabs_fold_self_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -470,15 +468,14 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_fabs_fold_self_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -496,8 +493,8 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 { ; CI-LABEL: v_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -524,8 +521,8 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -545,30 +542,28 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s6 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -587,7 +582,7 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -610,7 +605,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; VI-LABEL: v_extract_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -629,7 +624,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -645,9 +640,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -680,7 +673,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_no_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -698,7 +691,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; VI-LABEL: v_extract_fabs_no_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -716,7 +709,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -730,9 +723,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 60e19dcd48f1e..07581ade57ccd 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fabsf_free: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_free: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %bc= bitcast i32 %in to float @@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in) @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-LABEL: fabs_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -131,26 +131,26 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s7, 31 -; SI-NEXT: s_bitset0_b32 s6, 31 -; SI-NEXT: s_bitset0_b32 s5, 31 -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_bitset0_b32 s2, 31 +; SI-NEXT: s_bitset0_b32 s1, 31 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_bitset0_b32 s3, 31 @@ -202,7 +202,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) { ; SI-LABEL: fabs_fold: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i ; ; VI-LABEL: fabs_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 @@ -232,23 +232,23 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, |s4|, 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bitpreserve_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_f32_e64 v2, |s2|, 1.0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %in.bc = bitcast float %in to i32 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index f0ce96af90649..d53c0411ad88c 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -23,7 +23,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -35,7 +35,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -46,7 +46,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -59,7 +59,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -78,8 +78,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[6:7], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -89,8 +89,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX8-LABEL: s_test_canonicalize_var_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX9-LABEL: s_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -111,11 +111,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX11-LABEL: s_test_canonicalize_var_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_max_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -123,7 +123,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX12-LABEL: s_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -162,7 +162,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -175,7 +175,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -195,7 +195,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -207,7 +207,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -218,7 +218,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -275,7 +275,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -308,7 +308,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -318,7 +318,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: test_fold_canonicalize_undef_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -326,7 +326,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_fold_canonicalize_undef_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -336,7 +336,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: test_fold_canonicalize_undef_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -351,7 +351,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -361,7 +361,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -379,7 +379,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -413,7 +413,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -424,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -440,7 +440,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +459,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -469,7 +469,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -494,7 +494,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, -1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -503,7 +503,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -513,7 +513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -528,7 +528,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -547,7 +547,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,7 +557,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -572,7 +572,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -582,7 +582,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -590,7 +590,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -615,7 +615,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -636,7 +636,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -647,7 +647,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -663,7 +663,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -674,7 +674,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -695,7 +695,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -711,7 +711,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -722,7 +722,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -732,7 +732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -743,7 +743,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -759,7 +759,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -769,7 +769,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -788,7 +788,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -803,7 +803,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -813,7 +813,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -822,7 +822,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -833,7 +833,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -859,7 +859,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x807fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -878,7 +878,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -893,7 +893,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -903,7 +903,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -912,7 +912,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -922,7 +922,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -937,7 +937,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -947,7 +947,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -966,7 +966,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -981,7 +981,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -991,7 +991,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1010,7 +1010,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1069,7 +1069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1142,7 +1142,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1176,7 +1176,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1186,7 +1186,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1201,7 +1201,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1224,7 +1224,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1237,7 +1237,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1276,7 +1276,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX9-LABEL: s_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX11-LABEL: s_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX12-LABEL: s_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1324,7 +1324,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1368,7 +1368,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1425,7 +1425,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1437,7 +1437,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1448,7 +1448,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,7 +1541,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1550,7 +1550,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1577,7 +1577,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1588,7 +1588,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1597,7 +1597,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1607,7 +1607,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1622,7 +1622,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1652,7 +1652,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1667,7 +1667,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +1678,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1687,7 +1687,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1712,7 +1712,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1761,7 +1761,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,7 +1772,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1793,7 +1793,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1809,7 +1809,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1840,7 +1840,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1856,7 +1856,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1867,7 +1867,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1915,7 +1915,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1949,7 +1949,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1960,7 +1960,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1979,7 +1979,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,7 +2005,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2024,7 +2024,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2039,7 +2039,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2069,7 +2069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2095,7 +2095,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2114,7 +2114,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2129,7 +2129,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,7 +2140,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2159,7 +2159,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2174,7 +2174,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,7 +2185,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2194,7 +2194,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2204,7 +2204,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f64_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f64_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f64_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2264,9 +2264,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f64_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2279,9 +2277,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f64_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2303,7 +2299,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f32_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2320,7 +2316,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f32_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2337,7 +2333,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f32_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2348,9 +2344,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f32_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2363,9 +2357,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f32_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2387,7 +2379,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2405,7 +2397,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2422,7 +2414,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2433,9 +2425,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2448,9 +2438,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2473,7 +2461,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_v2f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2496,7 +2484,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2516,7 +2504,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2527,9 +2515,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2542,9 +2528,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2566,7 +2550,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f64_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2583,7 +2567,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f64_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2600,7 +2584,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f64_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2611,9 +2595,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f64_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2626,9 +2608,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f64_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2650,7 +2630,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f32_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2667,7 +2647,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f32_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2684,7 +2664,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f32_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2695,9 +2675,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f32_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2710,9 +2688,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f32_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2735,7 +2711,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2753,7 +2729,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2770,7 +2746,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2781,9 +2757,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2796,9 +2770,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2822,7 +2794,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2845,7 +2817,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2864,7 +2836,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2875,9 +2847,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2890,9 +2860,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2914,7 +2882,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 { ; GFX6-LABEL: v_test_canonicalize_var_v2f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2931,7 +2899,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_test_canonicalize_var_v2f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2948,7 +2916,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2961,11 +2929,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2978,11 +2944,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_test_canonicalize_var_v2f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index bd483f4c07071..ae280c5a443e1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -15,31 +15,30 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_copysign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s3, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_bfi_b32 v2, s3, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -47,29 +46,29 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; ; GFX9-LABEL: s_copysign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -82,8 +81,8 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -94,10 +93,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -106,22 +105,22 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -136,8 +135,8 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -148,10 +147,10 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -160,22 +159,22 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -190,8 +189,8 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -202,10 +201,10 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -214,22 +213,22 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_10.0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -244,8 +243,8 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -256,10 +255,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x8000 +; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -268,22 +267,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -298,8 +297,8 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -310,10 +309,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x8000 +; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -322,22 +321,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -352,26 +351,25 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s4, v0 +; VI-NEXT: v_and_b32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -379,23 +377,23 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_0_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -409,26 +407,25 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -437,24 +434,24 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -469,27 +466,26 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0x41200000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -498,24 +494,24 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; ; GFX9-LABEL: s_test_copysign_f16_10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -530,26 +526,25 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -558,24 +553,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; ; GFX9-LABEL: s_test_copysign_f16_neg1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -590,27 +585,26 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -619,24 +613,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_copysign_f16_neg10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -829,8 +823,8 @@ define half @v_test_copysign_f16_neg10(half %mag) { define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -855,8 +849,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -880,17 +874,17 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] @@ -899,10 +893,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -931,8 +923,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -957,8 +949,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -982,15 +974,15 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v1, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 @@ -1001,12 +993,9 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] @@ -1035,8 +1024,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1060,8 +1049,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1085,14 +1074,14 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[0:1] ; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1104,10 +1093,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1136,8 +1123,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1163,8 +1150,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1188,14 +1175,14 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v1, s[0:1] ; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -1207,12 +1194,10 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[4:5] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -1239,8 +1224,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1266,8 +1251,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1291,14 +1276,14 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v1, s[0:1] ; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -1310,10 +1295,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1342,35 +1325,35 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -1392,15 +1375,15 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1 @@ -1410,12 +1393,10 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1442,8 +1423,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1471,8 +1452,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1496,17 +1477,17 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_short v2, v0, s[4:5] @@ -1515,10 +1496,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1547,8 +1526,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s4, s3, 8 @@ -1611,8 +1590,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s7, 8 ; VI-NEXT: s_and_b32 s1, s7, 0x1ff @@ -1660,7 +1639,10 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; VI-NEXT: v_or_b32_e32 v2, 0x7c00, v2 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_lshr_b32 s0, s7, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: s_and_b32 s0, s0, 0x8000 +; VI-NEXT: v_or_b32_e32 v2, s0, v2 ; VI-NEXT: s_movk_i32 s0, 0x7fff ; VI-NEXT: v_mov_b32_e32 v3, s8 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 @@ -1669,8 +1651,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 @@ -1691,33 +1673,36 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: s_add_i32 s7, s1, 0xfffffc10 +; GFX9-NEXT: s_add_i32 s9, s1, 0xfffffc10 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_lshl_b32 s1, s7, 12 +; GFX9-NEXT: s_lshl_b32 s1, s9, 12 ; GFX9-NEXT: s_or_b32 s0, s2, s0 ; GFX9-NEXT: s_or_b32 s1, s6, s1 -; GFX9-NEXT: s_cmp_lt_i32 s7, 1 -; GFX9-NEXT: s_cselect_b32 s9, s0, s1 -; GFX9-NEXT: s_and_b32 s2, s9, 7 +; GFX9-NEXT: s_cmp_lt_i32 s9, 1 +; GFX9-NEXT: s_cselect_b32 s10, s0, s1 +; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_cmp_gt_i32 s2, 5 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s2, 3 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_lshr_b32 s2, s9, 2 +; GFX9-NEXT: s_lshr_b32 s2, s10, 2 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_addc_u32 s0, s2, 0 -; GFX9-NEXT: s_cmp_lt_i32 s7, 31 +; GFX9-NEXT: s_cmp_lt_i32 s9, 31 ; GFX9-NEXT: s_cselect_b32 s2, s0, 0x7c00 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 9, v1 -; GFX9-NEXT: s_cmpk_eq_i32 s7, 0x40f +; GFX9-NEXT: s_cmpk_eq_i32 s9, 0x40f ; GFX9-NEXT: v_or_b32_e32 v1, 0x7c00, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_lshr_b32 s0, s7, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_and_b32 s0, s0, 0x8000 +; GFX9-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 @@ -1727,8 +1712,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff ; GFX11-NEXT: s_lshr_b32 s2, s7, 8 @@ -1743,13 +1728,13 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: s_addk_i32 s1, 0xfc10 ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 ; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: s_lshl_b32 s7, s1, 12 +; GFX11-NEXT: s_lshl_b32 s8, s1, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s6, v1 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_or_b32 s3, s2, 0x1000 -; GFX11-NEXT: s_or_b32 s7, s2, s7 +; GFX11-NEXT: s_or_b32 s8, s2, s8 ; GFX11-NEXT: s_lshr_b32 s6, s3, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6 @@ -1760,15 +1745,15 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-NEXT: s_or_b32 s3, s6, s3 ; GFX11-NEXT: s_cmp_lt_i32 s1, 1 -; GFX11-NEXT: s_cselect_b32 s3, s3, s7 +; GFX11-NEXT: s_cselect_b32 s3, s3, s8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s6, s3, 7 ; GFX11-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: s_lshr_b32 s3, s3, 2 -; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 @@ -1779,11 +1764,15 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshr_b32 s1, s7, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s1, s1, 0x8000 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0 -; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 ; GFX11-NEXT: global_store_b16 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -1798,7 +1787,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,7 +1811,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; VI-LABEL: s_copysign_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s4, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1842,26 +1831,26 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 +; GFX9-NEXT: s_lshr_b32 s2, s6, 16 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s3 @@ -1887,8 +1876,8 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s4, 16 @@ -1915,8 +1904,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; VI-LABEL: s_copysign_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -1944,33 +1933,33 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; GFX9-LABEL: s_copysign_v3f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 -; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 +; GFX9-NEXT: global_store_short v0, v2, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s2, s6, 16 @@ -1999,8 +1988,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2034,8 +2023,8 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; VI-LABEL: s_copysign_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -2065,39 +2054,39 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; GFX9-LABEL: s_copysign_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s3, s7, 16 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 542d67486e758..f48961c905f58 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) { ; SI-LABEL: s_test_copysign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; VI-LABEL: s_test_copysign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -34,7 +34,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; GFX11-LABEL: s_test_copysign_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -51,8 +51,8 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -63,10 +63,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -76,10 +76,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -94,8 +94,8 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -106,10 +106,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -119,10 +119,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -137,8 +137,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,10 +149,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -162,10 +162,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -180,8 +180,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -192,10 +192,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -205,10 +205,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_bitset1_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -223,8 +223,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,10 +235,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -248,10 +248,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_bitset1_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -266,8 +266,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -278,10 +278,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -291,10 +291,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -310,8 +310,8 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -323,10 +323,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -337,10 +337,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -356,8 +356,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -369,10 +369,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; ; VI-LABEL: s_test_copysign_f32_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -383,10 +383,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; GFX11-LABEL: s_test_copysign_f32_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -402,8 +402,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -415,10 +415,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; ; VI-LABEL: s_test_copysign_f32_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -429,10 +429,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; GFX11-LABEL: s_test_copysign_f32_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -448,8 +448,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,10 +461,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -475,10 +475,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f32_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -494,8 +494,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -511,8 +511,8 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -529,8 +529,8 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 @@ -549,40 +549,40 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x float> %mag, <3 x float> %sign) { ; SI-LABEL: s_test_copysign_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s7, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v0, s7, v0, v2 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 -; SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: v_bfi_b32 v2, s7, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s7, -2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_bfi_b32 v2, s7, v0, v1 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_bfi_b32 v1, s7, v3, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v3, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_bfi_b32 v0, s7, v0, v3 +; VI-NEXT: v_bfi_b32 v0, s2, v0, v3 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] @@ -591,8 +591,8 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo ; GFX11-LABEL: s_test_copysign_v3f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9 @@ -614,45 +614,45 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) { ; SI-LABEL: s_test_copysign_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_brev_b32 s12, -2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_bfi_b32 v2, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v2, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: v_bfi_b32 v0, s12, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_brev_b32 s12, -2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_bfi_b32 v2, s12, v2, v0 +; VI-NEXT: v_bfi_b32 v2, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_bfi_b32 v0, s12, v0, v4 +; VI-NEXT: v_bfi_b32 v0, s2, v0, v4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -661,8 +661,8 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo ; GFX11-LABEL: s_test_copysign_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10 @@ -906,46 +906,46 @@ define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) { define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) { ; SI-LABEL: s_test_copysign_f32_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s0, -2 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfi_b32 v2, s0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -958,7 +958,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -972,7 +972,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s3, 0x80000000 @@ -984,7 +984,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1003,7 +1003,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, float %mag, half %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,7 +1018,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1050,24 +1050,23 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f32_1_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: v_or_b32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_1_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1079,10 +1078,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out ; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 @@ -1101,7 +1100,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, float %mag, bfloat %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,7 +1116,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1130,7 +1129,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index c6b730e3fd5d6..6c5b2917855fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -68,7 +68,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ninf: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -133,13 +133,13 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -181,7 +181,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -229,7 +229,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ieee: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -253,7 +253,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ieee: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -275,26 +275,26 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ieee: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 @@ -334,7 +334,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_25ulp_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX8-LABEL: s_fdiv_25ulp_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -370,21 +370,21 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| -; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4 -; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7| +; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0 +; GFX10-NEXT: v_mul_f32_e32 v1, s7, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -420,7 +420,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -446,7 +446,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -465,7 +465,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s3 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 @@ -482,23 +482,23 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 -; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s2 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s7 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s6 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s3 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -535,7 +535,7 @@ entry: define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX67-LABEL: s_fdiv_fast_ieee_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -548,7 +548,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -559,17 +559,17 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_fast_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -599,7 +599,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -612,7 +612,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -623,17 +623,17 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -663,7 +663,7 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -676,7 +676,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -687,17 +687,17 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -727,7 +727,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -753,7 +753,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -779,7 +779,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX7-LABEL: s_fdiv_f32_arcp_daz: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -805,7 +805,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_daz: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -844,13 +844,13 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -892,7 +892,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_arcp_ninf: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -905,7 +905,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -916,17 +916,17 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -956,8 +956,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -996,10 +996,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s4 @@ -1012,11 +1013,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 @@ -1031,14 +1031,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v4 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1077,10 +1076,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 @@ -1093,11 +1093,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 ; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1109,7 +1108,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4 @@ -1118,10 +1116,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, s7, s5 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1131,9 +1130,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s0, s6, s6, s4 +; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5 ; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 @@ -1155,8 +1153,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 @@ -1214,8 +1212,8 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,8 +1226,8 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s6 ; GFX8-NEXT: v_rcp_f32_e32 v1, s7 @@ -1243,22 +1241,22 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, s7 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s6 ; GFX11-NEXT: v_rcp_f32_e32 v1, s7 @@ -1292,8 +1290,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1306,8 +1304,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; ; GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1321,22 +1319,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1370,8 +1368,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1384,8 +1382,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1399,22 +1397,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1448,7 +1446,7 @@ entry: define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v4f32: ; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 @@ -1519,7 +1517,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v4f32: ; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -1590,7 +1588,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX7-LABEL: s_fdiv_v4f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 @@ -1661,7 +1659,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: s_fdiv_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1732,7 +1730,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: s_fdiv_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,7 +1792,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: s_fdiv_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1895,7 +1893,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_fast_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -1914,7 +1912,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_fast_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -1933,7 +1931,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_fast_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1951,7 +1949,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_fast_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2003,7 +2001,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_arcp_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -2022,7 +2020,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_arcp_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -2041,7 +2039,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_arcp_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -2059,7 +2057,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_arcp_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2111,8 +2109,8 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2134,11 +2132,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2152,14 +2150,13 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2181,11 +2178,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -2197,7 +2194,6 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2205,11 +2201,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -2221,7 +2217,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2229,11 +2225,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -2246,7 +2242,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2271,8 +2267,8 @@ entry: define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2292,11 +2288,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2308,14 +2304,13 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2335,11 +2330,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 ; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2 @@ -2349,7 +2344,6 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2357,21 +2351,21 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2379,22 +2373,22 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index c9618d43943ef..26714dcc6dfac 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -21,14 +21,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -36,11 +36,11 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -55,14 +55,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -70,14 +70,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -85,11 +85,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -104,14 +104,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -119,14 +119,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -134,11 +134,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -155,8 +155,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -173,8 +173,8 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -191,12 +191,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_add_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -214,18 +214,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -233,18 +233,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_add_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -252,11 +252,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_add_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -276,18 +276,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -298,18 +298,18 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -320,11 +320,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -348,12 +348,12 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -361,12 +361,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -374,11 +374,11 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -392,8 +392,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -408,8 +408,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -424,12 +424,12 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -446,16 +446,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -463,16 +463,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -480,11 +480,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_add_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -503,16 +503,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -523,16 +523,16 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -543,11 +543,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_add_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -570,14 +570,14 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -585,14 +585,14 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -600,11 +600,11 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -619,8 +619,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -637,8 +637,8 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -655,12 +655,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -678,18 +678,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -697,18 +697,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_and_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -716,11 +716,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_and_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -740,18 +740,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -762,18 +762,18 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -784,11 +784,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -812,12 +812,12 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -825,12 +825,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -838,11 +838,11 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -856,8 +856,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -872,8 +872,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -888,12 +888,12 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -910,16 +910,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -927,16 +927,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -944,11 +944,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_and_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -967,16 +967,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -987,16 +987,16 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1007,11 +1007,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1034,14 +1034,14 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1049,14 +1049,14 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1064,11 +1064,11 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1083,8 +1083,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1101,8 +1101,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1119,12 +1119,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1142,18 +1142,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1161,18 +1161,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1180,11 +1180,11 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_sub_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1204,18 +1204,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1226,18 +1226,18 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1248,11 +1248,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1276,12 +1276,12 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1289,12 +1289,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1302,11 +1302,11 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1320,8 +1320,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1336,8 +1336,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1352,12 +1352,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1374,16 +1374,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1391,16 +1391,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1408,11 +1408,11 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_sub_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1431,16 +1431,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1451,16 +1451,16 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1471,11 +1471,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1498,39 +1498,39 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1562,8 +1562,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1580,12 +1580,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1603,47 +1603,47 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1662,18 +1662,18 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1684,18 +1684,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -1706,11 +1706,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1734,35 +1734,35 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1775,8 +1775,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1791,8 +1791,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_max_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1807,12 +1807,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_max_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1829,43 +1829,43 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1883,16 +1883,16 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1903,16 +1903,16 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -1923,11 +1923,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1950,39 +1950,39 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1996,8 +1996,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2014,8 +2014,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2032,12 +2032,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2055,47 +2055,47 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2114,18 +2114,18 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2136,18 +2136,18 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2158,11 +2158,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2186,35 +2186,35 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2227,8 +2227,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2243,8 +2243,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umax_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2259,12 +2259,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umax_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2281,43 +2281,43 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2335,16 +2335,16 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2355,16 +2355,16 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2375,11 +2375,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2402,39 +2402,39 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2448,8 +2448,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2466,8 +2466,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2484,12 +2484,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2507,47 +2507,47 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2566,18 +2566,18 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2588,18 +2588,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2610,11 +2610,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2638,35 +2638,35 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2679,8 +2679,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2695,8 +2695,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2711,12 +2711,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2733,43 +2733,43 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2787,16 +2787,16 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2807,16 +2807,16 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2827,11 +2827,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2854,39 +2854,39 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,8 +2900,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2918,8 +2918,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2936,12 +2936,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2959,47 +2959,47 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3018,18 +3018,18 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3040,18 +3040,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3062,11 +3062,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3090,35 +3090,35 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -3131,8 +3131,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3147,8 +3147,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umin_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3163,12 +3163,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umin_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3185,43 +3185,43 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3239,16 +3239,16 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3259,16 +3259,16 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3279,11 +3279,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3306,14 +3306,14 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3321,14 +3321,14 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3336,11 +3336,11 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3355,8 +3355,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3373,8 +3373,8 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3391,12 +3391,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3414,18 +3414,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3433,18 +3433,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3452,11 +3452,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN3-LABEL: atomic_or_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3476,18 +3476,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3498,18 +3498,18 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3520,11 +3520,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3548,12 +3548,12 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3561,12 +3561,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3574,11 +3574,11 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3592,8 +3592,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3608,8 +3608,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3624,12 +3624,12 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3646,16 +3646,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3663,16 +3663,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3680,11 +3680,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN3-LABEL: atomic_or_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3703,16 +3703,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3723,16 +3723,16 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3743,11 +3743,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3770,14 +3770,14 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3785,14 +3785,14 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3800,11 +3800,11 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3819,14 +3819,14 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; GCN1-LABEL: atomic_xchg_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3834,14 +3834,14 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN2-LABEL: atomic_xchg_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3849,11 +3849,11 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN3-LABEL: atomic_xchg_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3868,8 +3868,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3886,8 +3886,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3904,12 +3904,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3927,18 +3927,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3946,18 +3946,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3965,11 +3965,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN3-LABEL: atomic_xchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3989,18 +3989,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4011,18 +4011,18 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4033,11 +4033,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4061,12 +4061,12 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4074,12 +4074,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4087,11 +4087,11 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4105,8 +4105,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4121,8 +4121,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4137,12 +4137,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4159,16 +4159,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4176,16 +4176,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4193,11 +4193,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4216,16 +4216,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4236,16 +4236,16 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4256,11 +4256,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4285,7 +4285,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4300,7 +4300,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN2-LABEL: atomic_cmpxchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4334,8 +4334,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 16 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -4353,8 +4353,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 16 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -4372,13 +4372,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4397,19 +4397,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4418,19 +4418,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4439,12 +4439,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4465,19 +4465,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4489,19 +4489,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4513,12 +4513,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4544,7 +4544,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 @@ -4557,7 +4557,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 @@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4588,8 +4588,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 @@ -4605,8 +4605,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 @@ -4622,13 +4622,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4646,17 +4646,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4665,17 +4665,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4684,12 +4684,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4709,17 +4709,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4731,17 +4731,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4753,12 +4753,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4783,14 +4783,14 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4798,14 +4798,14 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4813,11 +4813,11 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4832,8 +4832,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -4850,8 +4850,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -4868,12 +4868,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4891,18 +4891,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4910,18 +4910,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4929,11 +4929,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_xor_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4953,18 +4953,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4975,18 +4975,18 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4997,11 +4997,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5025,12 +5025,12 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5038,12 +5038,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5051,11 +5051,11 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5069,8 +5069,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -5085,8 +5085,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -5101,12 +5101,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5123,16 +5123,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5140,16 +5140,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xor_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5157,11 +5157,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xor_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5180,16 +5180,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5200,16 +5200,16 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5220,11 +5220,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5247,7 +5247,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5279,7 +5279,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5300,7 +5300,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5314,7 +5314,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5328,7 +5328,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5348,8 +5348,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5368,8 +5368,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5388,10 +5388,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5414,8 +5414,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5432,8 +5432,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5450,10 +5450,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5475,37 +5475,37 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5518,33 +5518,33 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5556,8 +5556,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5572,8 +5572,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5588,15 +5588,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -5609,8 +5609,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5623,8 +5623,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN2-LABEL: atomic_store_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5637,15 +5637,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN3-LABEL: atomic_store_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5657,7 +5657,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5673,7 +5673,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5689,7 +5689,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5710,7 +5710,7 @@ entry: define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5724,7 +5724,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5738,7 +5738,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5758,8 +5758,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5778,8 +5778,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5798,10 +5798,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5824,8 +5824,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5842,8 +5842,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5860,10 +5860,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5885,37 +5885,37 @@ entry: define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5928,33 +5928,33 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5966,8 +5966,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5982,8 +5982,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN2-LABEL: atomic_store_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5998,15 +5998,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN3-LABEL: atomic_store_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6019,8 +6019,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6033,8 +6033,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN2-LABEL: atomic_store_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6047,15 +6047,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN3-LABEL: atomic_store_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6067,7 +6067,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6083,7 +6083,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6099,7 +6099,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6120,7 +6120,7 @@ entry: define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6134,7 +6134,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6148,7 +6148,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6168,8 +6168,8 @@ entry: define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_load_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 @@ -6206,11 +6206,11 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_load_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s4, s2 +; GCN3-NEXT: s_addc_u32 s1, s5, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc @@ -6231,37 +6231,37 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6274,33 +6274,33 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6312,8 +6312,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s6 ; GCN1-NEXT: s_addc_u32 s1, s5, s7 @@ -6327,8 +6327,8 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_store_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s6 ; GCN2-NEXT: s_addc_u32 s1, s5, s7 @@ -6342,14 +6342,14 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_store_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s0, s4, s6 ; GCN3-NEXT: s_addc_u32 s1, s5, s7 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6362,7 +6362,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6378,7 +6378,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6394,7 +6394,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6415,7 +6415,7 @@ entry: define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6429,7 +6429,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6443,7 +6443,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6463,8 +6463,8 @@ entry: define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -6483,8 +6483,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -6503,10 +6503,10 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -6529,37 +6529,37 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6572,33 +6572,33 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6610,8 +6610,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6626,8 +6626,8 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6642,15 +6642,15 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6663,37 +6663,37 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6706,33 +6706,33 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6744,33 +6744,33 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6782,33 +6782,33 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6819,14 +6819,14 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6834,14 +6834,14 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6849,11 +6849,11 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6868,14 +6868,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6883,14 +6883,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6898,11 +6898,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6917,14 +6917,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6932,14 +6932,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6947,11 +6947,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -6968,8 +6968,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -6986,8 +6986,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7004,12 +7004,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7027,18 +7027,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7046,18 +7046,18 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i32_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7065,11 +7065,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_inc_i32_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7089,18 +7089,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7111,18 +7111,18 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7133,11 +7133,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7161,12 +7161,12 @@ entry: define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7174,12 +7174,12 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7187,11 +7187,11 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7205,8 +7205,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7221,8 +7221,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7237,12 +7237,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7259,16 +7259,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7276,16 +7276,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_inc_i32_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7293,11 +7293,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_inc_i32_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7316,16 +7316,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7336,16 +7336,16 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7356,11 +7356,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7383,14 +7383,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7398,14 +7398,14 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7413,11 +7413,11 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7432,14 +7432,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7447,14 +7447,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7462,11 +7462,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7481,14 +7481,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7496,14 +7496,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7511,11 +7511,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -7532,8 +7532,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -7550,8 +7550,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7568,12 +7568,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7591,18 +7591,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7610,18 +7610,18 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i32_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7629,11 +7629,11 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_dec_i32_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7653,18 +7653,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7675,18 +7675,18 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7697,11 +7697,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7725,12 +7725,12 @@ entry: define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7738,12 +7738,12 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7751,11 +7751,11 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7769,8 +7769,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7785,8 +7785,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7801,12 +7801,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7823,16 +7823,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7840,16 +7840,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i32_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7857,11 +7857,11 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_dec_i32_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7880,16 +7880,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7900,16 +7900,16 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7920,11 +7920,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7947,7 +7947,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7963,7 +7963,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7979,7 +7979,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -7999,7 +7999,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8013,7 +8013,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8027,7 +8027,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8046,7 +8046,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -8062,7 +8062,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -8078,7 +8078,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8098,7 +8098,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8112,7 +8112,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8126,7 +8126,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 4d80e9124f41f..66aacd7062a6d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3823,7 +3823,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3918,8 +3918,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -3953,8 +3953,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -3988,32 +3988,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4029,7 +4029,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4057,7 +4057,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] @@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4119,8 +4119,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -4152,8 +4152,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -4185,32 +4185,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4966,7 +4966,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4996,7 +4996,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5061,8 +5061,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5096,8 +5096,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5131,32 +5131,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5172,8 +5172,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5205,8 +5205,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5238,32 +5238,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6760,7 +6760,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -6790,7 +6790,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6855,8 +6855,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -6890,8 +6890,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -6925,32 +6925,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6966,8 +6966,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 @@ -6990,8 +6990,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 @@ -7014,17 +7014,17 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s4, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7043,8 +7043,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -7076,8 +7076,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -7109,32 +7109,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index 4846e21fe836e..bac2d8b8b40c2 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -21,7 +21,7 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -41,9 +41,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -77,7 +75,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -103,9 +101,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -150,7 +146,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -170,9 +166,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -206,7 +200,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -226,9 +220,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -262,7 +254,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -288,9 +280,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -335,7 +325,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -355,9 +345,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -391,7 +379,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -417,9 +405,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -464,7 +450,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -484,9 +470,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -522,7 +506,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -548,9 +532,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -597,7 +579,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -623,9 +605,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -672,7 +652,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -698,7 +678,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -723,9 +703,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -749,9 +727,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -798,7 +774,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -824,7 +800,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -849,9 +825,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -875,9 +849,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -927,56 +899,56 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -994,8 +966,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1020,56 +992,56 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1087,8 +1059,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1113,55 +1085,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1180,8 +1152,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1206,55 +1178,55 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1273,8 +1245,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1299,55 +1271,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1366,8 +1338,8 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1392,55 +1364,55 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1459,8 +1431,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1485,55 +1457,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1552,8 +1524,8 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1578,55 +1550,55 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1645,8 +1617,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1671,55 +1643,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1738,8 +1710,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1764,55 +1736,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1831,8 +1803,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1857,55 +1829,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1924,8 +1896,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1950,55 +1922,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -2017,8 +1989,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -2047,7 +2019,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_interp: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s10, -1 ; SI-NOFMA-NEXT: s_mov_b32 s14, s10 @@ -2079,7 +2051,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; SI-FMA-LABEL: test_f32_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2109,7 +2081,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f32_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2130,7 +2102,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f32_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2163,7 +2135,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; SI-FMA-LABEL: test_f64_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2193,7 +2165,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f64_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2214,7 +2186,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f64_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2248,7 +2220,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_neg_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2264,9 +2236,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2296,7 +2266,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2312,9 +2282,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: fma_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2344,7 +2312,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 { ; SI-LABEL: fma_neg_b_c_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0 @@ -2365,9 +2333,7 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: fma_neg_b_c_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index c60b9858abd83..7830c91851bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -15,7 +15,7 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_fadd_use_test_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_fadd_use_test_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -79,20 +79,20 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, float %x, [8 x i32], float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmac_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-NEXT: s_load_dword s3, s[6:7], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-NEXT: s_load_dword s3, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s0, 4 -; VI-NEXT: v_add_f32_e64 v2, s4, s4 +; VI-NEXT: v_add_f32_e64 v2, s6, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 +; VI-NEXT: v_mac_f32_e64 v3, s6, 2.0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -101,9 +101,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX10-LABEL: multiple_use_fadd_fmac_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, s2, s2 @@ -117,13 +117,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX11-LABEL: multiple_use_fadd_fmac_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, s4, s4 -; GFX11-NEXT: v_fma_f32 v2, s4, 2.0, s5 +; GFX11-NEXT: v_add_f32_e64 v1, s2, s2 +; GFX11-NEXT: v_fma_f32 v2, s2, 2.0, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc @@ -142,7 +142,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -161,7 +161,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_use_fadd_fmad_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -174,7 +174,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_use_fadd_fmad_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -198,21 +198,21 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_use_fadd_multi_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s6, s4, 4 +; VI-NEXT: s_add_u32 s4, s6, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0 ; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s7, s5, 0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_addc_u32 s5, s7, 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -220,23 +220,23 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; GFX10-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_fma_f32 v1, |s0|, 2.0, s1 ; GFX10-NEXT: v_fma_f32 v2, |s0|, 2.0, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[4:5] offset:4 +; GFX10-NEXT: global_store_dword v0, v2, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v1, |s4|, 2.0, s5 @@ -261,8 +261,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -275,8 +275,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn2_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, s2, -4.0 @@ -288,12 +288,12 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn2_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, s4, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -310,8 +310,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn3_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 @@ -325,8 +325,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn3_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 @@ -338,12 +338,12 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn3_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s4 +; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -360,8 +360,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_fadd_use_test_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -378,8 +378,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_fadd_use_test_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -396,13 +396,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -414,12 +414,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -433,13 +433,14 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_clause 0x1 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s3, -1.0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -447,7 +448,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0 ; GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0 -; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -455,12 +455,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s3, -1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -471,7 +472,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0 -; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -496,14 +496,14 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, s6, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 +; VI-DENORM-NEXT: v_add_f16_e64 v2, s6, s6 ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -517,12 +517,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, s6, s6 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3 @@ -530,7 +530,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 +; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s6, 2.0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -539,8 +539,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -555,8 +555,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 @@ -571,13 +571,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s4, s4 -; GFX11-DENORM-NEXT: v_fma_f16 v2, s4, 2.0, s2 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s2, s2 +; GFX11-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -589,12 +589,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s4, s4 -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -617,14 +617,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-DENORM-NEXT: v_add_f16_e64 v2, |s6|, |s6| ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -638,14 +638,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 -; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0 +; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s6|, |s6| ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 @@ -660,8 +660,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -676,8 +676,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| @@ -692,13 +692,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s4|, |s4| -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s2 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2| +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -710,12 +710,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -739,9 +739,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-DENORM-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -762,9 +762,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; ; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-FLUSH-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -786,14 +786,14 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 -; GFX10-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 +; GFX10-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, s1 +; GFX10-DENORM-NEXT: v_fma_f16 v1, |s6|, 2.0, s0 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3] offset:2 @@ -803,12 +803,12 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| +; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s6|, |s6| ; GFX10-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 @@ -821,17 +821,17 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x2 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 -; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 -; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3] dlc +; GFX11-DENORM-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s3 +; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s2 +; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3] offset:2 dlc +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -840,19 +840,19 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x2 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 -; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[2:3] dlc +; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 +; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s3, v0 +; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[2:3] offset:2 dlc +; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -873,8 +873,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -887,8 +887,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, s2, -4.0 @@ -900,13 +900,13 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, s4, -4.0 +; GFX11-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -925,8 +925,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn3_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 @@ -940,8 +940,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn3_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, 0xc600, s2 @@ -953,13 +953,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn3_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s4 +; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 9300dfcb16e8a..9943976dd86da 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -22,7 +22,7 @@ declare half @llvm.fabs.f16(half) #1 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -62,7 +62,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -78,7 +78,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -92,7 +92,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -111,7 +111,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -136,7 +136,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -156,7 +156,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -176,7 +176,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-FLUSH-LABEL: fmul_fadd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -192,7 +192,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: s_clause 0x2 @@ -208,7 +208,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -222,7 +222,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-FLUSH-LABEL: fmul_fadd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: s_clause 0x2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -286,7 +286,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_contract_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -306,7 +306,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; VI-DENORM-LABEL: fmul_fadd_contract_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -326,7 +326,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -342,7 +342,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -356,7 +356,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -401,7 +401,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -437,7 +437,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -451,7 +451,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -464,9 +464,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -483,9 +481,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -513,7 +509,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -531,7 +527,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -549,7 +545,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -563,7 +559,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -576,9 +572,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -595,9 +589,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -625,7 +617,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_a_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -643,7 +635,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -661,7 +653,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -675,7 +667,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -689,7 +681,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -702,9 +694,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -721,9 +711,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -740,9 +728,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -773,7 +759,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_b_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -791,7 +777,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -809,7 +795,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -823,7 +809,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -837,7 +823,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -850,9 +836,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -869,9 +853,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -888,9 +870,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -921,7 +901,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -939,7 +919,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -957,7 +937,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -971,7 +951,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -984,9 +964,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1003,9 +981,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1033,7 +1009,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1051,7 +1027,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1069,7 +1045,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1083,7 +1059,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1096,9 +1072,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1115,9 +1089,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1147,7 +1119,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1165,7 +1137,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1183,7 +1155,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1197,7 +1169,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1210,9 +1182,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1229,9 +1199,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1261,7 +1229,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1279,7 +1247,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1297,7 +1265,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1311,7 +1279,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1324,9 +1292,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1343,9 +1309,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1375,7 +1339,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1400,7 +1364,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1425,56 +1389,54 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1493,9 +1455,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1514,9 +1474,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1550,7 +1508,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1575,7 +1533,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1600,56 +1558,54 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1668,9 +1624,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1689,9 +1643,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1725,7 +1677,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1750,7 +1702,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1775,56 +1727,54 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1843,9 +1793,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1864,9 +1812,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1901,7 +1847,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1926,7 +1872,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1951,56 +1897,54 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2019,9 +1963,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2040,9 +1982,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2077,7 +2017,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: neg_neg_mad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2102,7 +2042,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2127,56 +2067,54 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2195,9 +2133,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2216,9 +2152,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2254,7 +2188,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_fabs_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2279,7 +2213,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2304,56 +2238,54 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2372,9 +2304,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2393,9 +2323,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2430,7 +2358,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2448,7 +2376,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2466,7 +2394,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2480,7 +2408,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2494,7 +2422,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2507,9 +2435,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2526,9 +2452,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2545,9 +2469,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2577,7 +2499,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2595,7 +2517,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2613,7 +2535,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2627,7 +2549,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2641,7 +2563,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2654,9 +2576,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2673,9 +2593,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2692,9 +2610,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index ba8b6fb80518f..f411a76e75ab6 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -15,8 +15,8 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; SI-LABEL: fnearbyint_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,24 +28,23 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; ; CI-LABEL: fnearbyint_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_rndne_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fnearbyint_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f16_e32 v2, s4 +; VI-NEXT: v_rndne_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -54,11 +53,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; GFX11-LABEL: fnearbyint_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f16_e32 v1, s4 +; GFX11-NEXT: v_rndne_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -71,8 +70,8 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; SICI-LABEL: fnearbyint_f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dword s4, s[2:3], 0xb -; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SICI-NEXT: s_load_dword s4, s[0:1], 0xb +; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -82,10 +81,10 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; ; VI-LABEL: fnearbyint_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f32_e32 v2, s4 +; VI-NEXT: v_rndne_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -94,11 +93,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; GFX11-LABEL: fnearbyint_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s4 +; GFX11-NEXT: v_rndne_f32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -112,7 +111,7 @@ entry: define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v2f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s7, 0xf000 ; SICI-NEXT: s_mov_b32 s6, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,7 +124,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fnearbyint_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_rndne_f32_e32 v1, s3 @@ -136,7 +135,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; GFX11-LABEL: fnearbyint_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v1, s3 @@ -154,8 +153,8 @@ entry: define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v4f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SICI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -168,8 +167,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fnearbyint_v4f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -183,8 +182,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; GFX11-LABEL: fnearbyint_v4f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v3, s7 @@ -204,7 +203,7 @@ entry: define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-LABEL: nearbyint_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s8, -2 @@ -228,7 +227,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; CI-LABEL: nearbyint_f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -238,7 +237,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: nearbyint_f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -248,7 +247,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; GFX11-LABEL: nearbyint_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] @@ -264,41 +263,41 @@ entry: define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: nearbyint_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 ; SI-NEXT: s_mov_b32 s9, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v7, s3 ; SI-NEXT: v_bfi_b32 v1, s10, v6, v7 -; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_mov_b32_e32 v9, s5 -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] +; SI-NEXT: v_mov_b32_e32 v8, s2 +; SI-NEXT: v_mov_b32_e32 v9, s1 +; SI-NEXT: v_mov_b32_e32 v10, s0 +; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1] ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] +; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1] ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: nearbyint_v2f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -309,8 +308,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; VI-LABEL: nearbyint_v2f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] @@ -322,8 +321,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; GFX11-LABEL: nearbyint_v2f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] @@ -341,8 +340,8 @@ entry: define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: nearbyint_v4f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_brev_b32 s14, -2 @@ -391,8 +390,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; CI-LABEL: nearbyint_v4f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -406,8 +405,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; VI-LABEL: nearbyint_v4f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] ; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] @@ -426,8 +425,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; GFX11-LABEL: nearbyint_v4f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 74e2b9ea71425..b5440b9c38c9f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2799,7 +2799,7 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -2813,7 +2813,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3016,41 +3016,41 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s4, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_select_infloop_regression_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s4, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; VI-NEXT: s_cselect_b32 s0, 0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cselect_b32 s2, 0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %i = select i1 %arg1, double 0.0, double %arg @@ -3080,11 +3080,11 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_bitcmp1_b32 s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_bitcmp1_b32 s2, 16 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] @@ -3096,11 +3096,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; ; VI-LABEL: s_fneg_select_infloop_regression_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_bitcmp1_b32 s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 @@ -3146,7 +3146,7 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s1, 1, s1 ; SI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3161,7 +3161,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, 1, s1 ; VI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3216,8 +3216,8 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3235,8 +3235,8 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3279,7 +3279,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fabs_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3293,7 +3293,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fabs_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3329,7 +3329,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_fabs_select_infloop_regression: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 8267bb9f5450f..4364b32e62f8c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -7,12 +7,12 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -23,8 +23,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -36,8 +36,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fadd_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -49,13 +49,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_sub_f16_e64 v1, s2, |s4| +; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -70,13 +70,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1| -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -87,8 +87,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fmul_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -100,8 +100,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fmul_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -113,13 +113,13 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fmul_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, s2, -|s4| +; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -137,8 +137,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: fneg_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -149,8 +149,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: fneg_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -161,8 +161,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: fneg_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -173,10 +173,10 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: fneg_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -193,8 +193,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: fneg_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -205,8 +205,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fneg_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -217,8 +217,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -229,10 +229,10 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fneg_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -248,7 +248,7 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -262,7 +262,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -273,7 +273,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -293,12 +293,12 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; CI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -314,8 +314,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -331,8 +331,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -344,11 +344,11 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s4 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -367,8 +367,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -379,8 +379,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_fneg_fabs_v2f16_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -391,8 +391,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -403,10 +403,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -422,7 +422,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -435,7 +435,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX9-LABEL: fneg_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -447,7 +447,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX11-LABEL: fneg_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -467,12 +467,12 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -487,8 +487,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fold_user_fneg_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -503,8 +503,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: fold_user_fneg_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -515,11 +515,11 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: fold_user_fneg_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -536,8 +536,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -553,8 +553,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; VI-LABEL: s_fneg_multi_use_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -570,11 +570,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff ; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -585,8 +585,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -609,8 +609,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -633,8 +633,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_pk_mul_f16 v1, s4, -4.0 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] @@ -668,8 +668,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 6446145bbfe2a..3c000d4fa63a3 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fadd_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fadd_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_sub_f32_e64 v2, s3, |v0| @@ -36,7 +36,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fmul_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fmul_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0| @@ -67,11 +67,11 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_fabsf_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -79,10 +79,10 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_fabsf_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -129,11 +129,11 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -141,10 +141,10 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -159,7 +159,7 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fneg_fabsf_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -213,7 +213,7 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fneg_fabsf_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s2, 31 @@ -232,8 +232,8 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fneg_fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s7, 31 @@ -250,8 +250,8 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fneg_fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s7, 0x80000000 ; VI-NEXT: s_or_b32 s3, s6, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index e447429539e6f..d78bdfe08772a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,10 +19,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: s_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,10 +32,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: s_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -50,7 +50,7 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) { ; SI-LABEL: s_fneg_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -65,7 +65,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; VI-LABEL: s_fneg_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 @@ -78,7 +78,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; GFX11-LABEL: s_fneg_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 @@ -97,8 +97,8 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) { ; SI-LABEL: s_fneg_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -115,8 +115,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; ; VI-LABEL: s_fneg_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s7, 0x80000000 ; VI-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -134,8 +134,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; GFX11-LABEL: s_fneg_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -157,8 +157,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fsub0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -168,10 +168,10 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fsub0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 0, s4 +; VI-NEXT: v_sub_f32_e64 v2, 0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -180,11 +180,11 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fsub0_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4 +; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -198,8 +198,8 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,10 +210,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -223,10 +223,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fneg_free_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,8 +242,8 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fold_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fold_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, -s4, s4 +; VI-NEXT: v_mul_f32_e64 v2, -s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -265,11 +265,11 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: fneg_fold_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4 +; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -284,8 +284,8 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,10 +295,10 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; ; VI-LABEL: bitpreserve_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0 +; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -307,11 +307,11 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; GFX11-LABEL: bitpreserve_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -327,8 +327,8 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -339,10 +339,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -352,10 +352,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -380,8 +380,8 @@ define i32 @v_fneg_i32(i32 %in) { define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -391,10 +391,10 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4 +; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -403,11 +403,11 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4 +; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -434,7 +434,7 @@ define float @v_fneg_i32_fp_use(i32 %in) { define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 @@ -460,7 +460,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -488,7 +488,7 @@ define i64 @v_fneg_i64(i64 %in) { define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +500,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64_fp_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 @@ -550,24 +550,23 @@ define i16 @v_fneg_i16(i16 %in) { define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; SI-LABEL: s_fneg_i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4 +; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -576,11 +575,11 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fneg_i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4 +; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -620,8 +619,8 @@ define half @v_fneg_i16_fp_use(i16 %in) { define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -632,15 +631,15 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; ; VI-LABEL: s_fneg_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 +; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -650,10 +649,10 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; GFX11-LABEL: s_fneg_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -696,35 +695,34 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) { define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_v2i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_add_f16_e64 v1, s3, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -735,11 +733,11 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; GFX11-LABEL: s_fneg_v2i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 1914b74be1909..9f339af0f5580 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -16,24 +16,18 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret void @@ -42,24 +36,22 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret void @@ -73,11 +65,8 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn: @@ -87,11 +76,8 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret @@ -105,11 +91,10 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn: @@ -119,11 +104,10 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret <2 x i16> %ret @@ -132,7 +116,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -141,7 +125,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -180,7 +164,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -189,7 +173,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -228,7 +212,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -238,7 +222,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 95d8ca391b843..f55e9f4821b47 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -14,17 +14,17 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -37,7 +37,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -49,7 +49,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -58,8 +58,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -77,7 +76,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -89,7 +88,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -98,8 +97,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -162,9 +160,8 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -183,17 +180,17 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -228,17 +225,17 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -273,17 +270,17 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3] ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -319,23 +316,19 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret void @@ -356,11 +349,8 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret @@ -369,23 +359,24 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_bf16 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret void @@ -395,8 +386,10 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn: @@ -406,14 +399,307 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret <2 x i16> %ret } +define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 1023 + %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret float %result +} + +define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 -256 + %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret float %result +} + +define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 1023 + %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret void +} + +define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 -256 + %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret void +} + +define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 + %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret <2 x half> %result +} + +define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 + %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret <2 x half> %result +} + +define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 + %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret void +} + +define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 + %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 + %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret <2 x i16> %result +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 + %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret <2 x i16> %result +} + +define void @flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 + %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret void +} + +define void @flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 + %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret void +} + attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index fb731cc00d3f0..18d2e52e8f900 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -9,24 +9,24 @@ declare double @llvm.fabs.f64(double) #1 define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isinf_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x204 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isinf_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -36,11 +36,11 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f ; GFX11-LABEL: test_isinf_pattern: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -57,24 +57,24 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_nlg_f32_e64 s[4:5], |s4|, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s0|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_not_isinf_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s4|, v0 +; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -84,11 +84,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; GFX11-LABEL: test_not_isinf_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s4| +; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -105,7 +105,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -115,7 +115,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; VI-LABEL: test_not_isinf_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; GFX11-LABEL: test_not_isinf_pattern_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -142,24 +142,24 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -169,11 +169,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -192,24 +192,24 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -219,11 +219,11 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,8 +241,8 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -266,11 +266,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -290,23 +290,23 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s6, s6 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 @@ -321,14 +321,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -346,7 +346,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocapture %out, float %x, float %y) #0 { ; SI-LABEL: test_isfinite_not_pattern_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 @@ -376,7 +376,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; GFX11-LABEL: test_isfinite_not_pattern_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 @@ -401,23 +401,23 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_u_f32_e64 s[4:5], s6, s6 -; SI-NEXT: v_cmp_neq_f32_e64 s[6:7], |s6|, v0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cmp_u_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s2|, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4 @@ -432,14 +432,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s2, s4, s4 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s4| +; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -457,24 +457,24 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -484,11 +484,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -507,24 +507,24 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_commute_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4_commute_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -534,11 +534,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; GFX11-LABEL: test_isfinite_pattern_4_commute_and: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,16 +557,16 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: s_load_dword s1, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_o_f32_e32 vcc, s1, v1 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s1, v0 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_cmp_o_f32_e32 vcc, s0, v1 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s0, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -574,14 +574,14 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; ; VI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x50 -; VI-NEXT: s_load_dword s1, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x50 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 -; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s5, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s5, v1 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] @@ -592,15 +592,15 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x50 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s3, s4, 0x1f8 -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s5 +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -618,8 +618,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -632,11 +632,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; ; VI-LABEL: test_isinf_pattern_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -646,11 +646,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; GFX11-LABEL: test_isinf_pattern_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x204 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -667,8 +667,8 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -684,11 +684,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -698,11 +698,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -721,8 +721,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -738,11 +738,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_4_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -752,11 +752,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_4_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 105d9246880a4..a6fadbed33c86 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -20,8 +20,8 @@ declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -30,8 +30,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,19 +41,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -63,8 +63,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -75,8 +75,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen @@ -86,8 +86,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -96,8 +96,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -107,19 +107,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -129,8 +129,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -242,15 +242,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -258,48 +257,60 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_clause 0x2 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_clause 0x2 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s6 +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm @@ -307,8 +318,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -320,15 +331,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf -; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -336,15 +346,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -353,12 +362,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -368,15 +377,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_clause 0x2 +; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -384,14 +392,13 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_clause 0x2 +; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -405,8 +412,8 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -415,8 +422,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -426,19 +433,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -448,8 +455,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -460,8 +467,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen @@ -471,8 +478,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -481,8 +488,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -492,19 +499,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -514,8 +521,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -627,7 +634,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -642,7 +649,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -657,19 +664,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: global_store_dword v1, v0, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -681,7 +688,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -694,7 +701,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -708,7 +715,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -722,7 +729,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -736,19 +743,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) -; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -760,7 +767,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll index e124aadf4e8c2..ea26ad06cbb16 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -18,8 +18,8 @@ declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,8 +28,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -39,19 +39,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -61,8 +61,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -72,8 +72,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -82,8 +82,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -93,19 +93,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -115,8 +115,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -219,15 +219,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -235,63 +234,74 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_clause 0x2 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_clause 0x2 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s6 +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf -; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -299,15 +309,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -316,12 +325,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -331,15 +340,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_clause 0x2 +; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -347,14 +355,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_clause 0x2 +; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -369,8 +376,8 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -379,8 +386,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -390,19 +397,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -412,8 +419,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -423,8 +430,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -433,8 +440,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -444,19 +451,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -466,8 +473,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -570,7 +577,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -585,7 +592,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -600,19 +607,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: global_store_dword v1, v0, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -624,7 +631,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -637,7 +644,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -651,7 +658,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -665,19 +672,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) -; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -689,7 +696,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 81859dce04889..d827ea0503a3b 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -10,7 +10,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index c17be87834aeb..03b8251ea4640 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index d8a726f251a01..8ab82b722445e 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -9,7 +9,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp32_to_fp16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp32_to_fp16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -45,7 +45,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp32_to_fp16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll index f18f5752269e0..47164c9d32a8f 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,20 +452,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v2, s11 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,20 +506,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s6 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; G_GFX10-NEXT: v_mov_b32_e32 v2, s7 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s10 +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v2, s11 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll index 6a2a8c3ce595d..3321948192bda 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,20 +452,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v2, s11 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,20 +506,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s6 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; G_GFX10-NEXT: v_mov_b32_e32 v2, s7 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s10 +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v2, s11 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 3571f3545ad1a..04ef30bd26aa5 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fp_to_sint_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,8 +47,8 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32_fabs: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -58,12 +58,12 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_i32_fabs: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e64 v0, |s4| +; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2| +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -87,7 +87,7 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_sint_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_sint_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: fp_to_sint_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -193,37 +193,37 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0x2f800000 -; SI-NEXT: s_mov_b32 s6, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s1, 0x2f800000 +; SI-NEXT: s_mov_b32 s2, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s4 -; SI-NEXT: v_mul_f32_e64 v1, |v0|, s5 +; SI-NEXT: v_trunc_f32_e32 v0, s0 +; SI-NEXT: v_mul_f32_e64 v1, |v0|, s1 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; SI-NEXT: v_floor_f32_e32 v1, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v1 -; SI-NEXT: v_fma_f32 v0, v1, s6, |v0| +; SI-NEXT: v_fma_f32 v0, v1, s2, |v0| ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v2 ; SI-NEXT: v_xor_b32_e32 v0, v0, v2 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s2, 0x2f800000 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, 0x2f800000 ; VI-NEXT: s_mov_b32 s5, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s4 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s2 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s5, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -294,7 +294,7 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 @@ -329,7 +329,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -452,17 +452,17 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_sint_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 ; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s5 -; SI-NEXT: v_trunc_f32_e32 v1, s4 -; SI-NEXT: v_trunc_f32_e32 v2, s7 -; SI-NEXT: v_trunc_f32_e32 v3, s6 +; SI-NEXT: v_trunc_f32_e32 v0, s1 +; SI-NEXT: v_trunc_f32_e32 v1, s0 +; SI-NEXT: v_trunc_f32_e32 v2, s3 +; SI-NEXT: v_trunc_f32_e32 v3, s2 ; SI-NEXT: v_mul_f32_e64 v4, |v0|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; SI-NEXT: v_mul_f32_e64 v6, |v1|, s8 @@ -503,14 +503,14 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % ; SI-NEXT: v_subb_u32_e32 v7, vcc, v12, v9, vcc ; SI-NEXT: v_sub_i32_e32 v4, vcc, v13, v11 ; SI-NEXT: v_subb_u32_e32 v5, vcc, v8, v11, vcc -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s9, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -737,8 +737,8 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -749,8 +749,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -787,8 +787,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -799,8 +799,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -838,8 +838,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_sint_f32_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,12 +849,12 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_f32_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index c6b4e129bacbe..5abf82aa1aab5 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_uint_f32_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % ; ; VI-LABEL: fp_to_uint_f32_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,7 +47,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -60,7 +60,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +92,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -152,34 +152,34 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x) { ; SI-LABEL: fp_to_uint_f32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s1, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s4 +; SI-NEXT: v_trunc_f32_e32 v0, s0 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_floor_f32_e32 v2, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v2 -; SI-NEXT: v_fma_f32 v0, v2, s5, v0 +; SI-NEXT: v_fma_f32 v0, v2, s1, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s2, 0xcf800000 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xcf800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s4 +; VI-NEXT: v_trunc_f32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; VI-NEXT: v_floor_f32_e32 v2, v1 -; VI-NEXT: v_fma_f32 v0, v2, s2, v0 +; VI-NEXT: v_fma_f32 v0, v2, s3, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -240,7 +240,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 @@ -264,7 +264,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -376,16 +376,16 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s5 -; SI-NEXT: v_trunc_f32_e32 v2, s4 -; SI-NEXT: v_trunc_f32_e32 v4, s7 -; SI-NEXT: v_trunc_f32_e32 v6, s6 +; SI-NEXT: v_trunc_f32_e32 v0, s1 +; SI-NEXT: v_trunc_f32_e32 v2, s0 +; SI-NEXT: v_trunc_f32_e32 v4, s3 +; SI-NEXT: v_trunc_f32_e32 v6, s2 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 @@ -406,14 +406,14 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x ; SI-NEXT: v_cvt_u32_f32_e32 v0, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v4, v9 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -619,8 +619,8 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -631,8 +631,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -669,8 +669,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -681,8 +681,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,8 +720,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,12 +731,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i ; ; VI-LABEL: fp_to_uint_f32_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index ea588df86b846..3c4087fe391b6 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -13,49 +13,51 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind rea define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s4, 1 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_not_b32 s4, s6 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, v1 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_not_b32 s5, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_not_b32 s3, s6 -; VI-NEXT: s_lshr_b32 s2, s4, 1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_lshr_b32 s1, s6, 1 +; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_not_b32 s3, s6 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_not_b32 s1, s2 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: @@ -75,30 +77,30 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshl_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, 1 -; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, 1 +; GFX10-NEXT: s_lshr_b32 s0, s6, 1 +; GFX10-NEXT: s_not_b32 s1, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, 1 -; GFX11-NEXT: s_lshr_b32 s2, s4, 1 -; GFX11-NEXT: s_not_b32 s3, s6 +; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, 1 +; GFX11-NEXT: s_lshr_b32 s1, s6, 1 +; GFX11-NEXT: s_not_b32 s0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v0, s1, v0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -111,7 +113,7 @@ entry: define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshl_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +126,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 @@ -135,12 +137,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 25 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: @@ -157,16 +159,16 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 25 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -183,15 +185,15 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_not_b32 s1, s1 ; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; SI-NEXT: s_not_b32 s1, s1 ; SI-NEXT: s_lshr_b32 s2, s5, 1 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 @@ -206,47 +208,47 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s1, s1 +; VI-NEXT: s_not_b32 s3, s3 ; VI-NEXT: s_lshr_b32 s7, s5, 1 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_not_b32 s2, s2 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s1, s4, 1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 +; VI-NEXT: s_lshr_b32 s3, s4, 1 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: s_not_b32 s3, s9 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: s_not_b32 s1, s9 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s3, s8 +; GFX9-NEXT: s_not_b32 s1, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -270,39 +272,39 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX10-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX10-NEXT: s_lshr_b32 s2, s5, 1 -; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_lshr_b32 s0, s5, 1 +; GFX10-NEXT: s_not_b32 s1, s3 ; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v1, s2, v0, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s0 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1 ; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s1, s1 +; GFX11-NEXT: s_not_b32 s3, s3 ; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -315,8 +317,8 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -329,8 +331,8 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -343,15 +345,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -371,20 +373,20 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 23 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 25 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 @@ -402,44 +404,44 @@ entry: define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s1, s19 ; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: s_not_b32 s11, s15 ; SI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v3, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s7, s7, 1 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_not_b32 s1, s18 +; SI-NEXT: s_not_b32 s7, s14 ; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s6, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_not_b32 s1, s17 +; SI-NEXT: s_not_b32 s6, s13 ; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s5, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s5, s5, 1 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_not_b32 s1, s16 +; SI-NEXT: s_not_b32 s5, s12 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s4, 1 -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: s_lshr_b32 s4, s4, 1 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_not_b32 s3, s15 @@ -472,36 +474,36 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_not_b32 s3, s15 +; GFX9-NEXT: s_not_b32 s1, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_lshr_b32 s2, s7, 1 +; GFX9-NEXT: s_lshr_b32 s0, s7, 1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s3, s14 +; GFX9-NEXT: s_not_b32 s1, s14 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v2, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s3, s13 +; GFX9-NEXT: s_not_b32 s1, s13 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s3, s12 +; GFX9-NEXT: s_not_b32 s1, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -532,11 +534,11 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX10-LABEL: fshl_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 @@ -560,9 +562,9 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-LABEL: fshl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1 @@ -594,10 +596,10 @@ entry: define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshl_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -607,13 +609,13 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 25 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -630,9 +632,9 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -666,22 +668,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshl_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31 @@ -702,7 +704,7 @@ entry: define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: orxor2or1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -718,7 +720,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; VI-LABEL: orxor2or1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 @@ -732,15 +734,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s4, s2, 7 -; GFX9-NEXT: s_or_b32 s4, s3, s4 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, s6, 7 +; GFX9-NEXT: s_or_b32 s0, s7, s0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: s_cselect_b32 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: orxor2or1: @@ -759,20 +761,20 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s4, s2, 7 -; GFX10-NEXT: s_or_b32 s4, s3, s4 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_lshl_b32 s0, s6, 7 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 +; GFX10-NEXT: s_cselect_b32 s0, s6, s7 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: orxor2or1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index dbcebe6e07e3f..e8377763e4be2 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -22,40 +22,42 @@ declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: @@ -72,24 +74,24 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshr_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -103,7 +105,7 @@ entry: define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshr_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -116,7 +118,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 @@ -127,12 +129,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 7 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: @@ -149,16 +151,16 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -175,9 +177,9 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -192,33 +194,33 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm @@ -240,13 +242,13 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] @@ -255,16 +257,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-LABEL: fshr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -277,8 +279,8 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -291,8 +293,8 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -305,15 +307,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -333,20 +335,20 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 @@ -364,11 +366,11 @@ entry: define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s19, 0xf000 -; SI-NEXT: s_mov_b32 s18, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s15 @@ -382,14 +384,14 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v4, s12 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s15 @@ -410,10 +412,10 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 @@ -451,9 +453,9 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s15 @@ -464,15 +466,15 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 @@ -496,10 +498,10 @@ entry: define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshr_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -509,13 +511,13 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -532,9 +534,9 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -566,22 +568,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshr_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 9bee539b1e4e5..d4398e5367c7f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,8 +19,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_add_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,12 +32,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_add_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -50,8 +50,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -65,14 +65,14 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_add_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -80,12 +80,12 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_add_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -98,8 +98,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -112,8 +112,8 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_add_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -126,12 +126,12 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_add_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -144,29 +144,29 @@ entry: define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -174,12 +174,12 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_add_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -195,8 +195,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -213,29 +213,29 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -251,9 +251,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -268,18 +268,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -287,12 +287,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -310,9 +310,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -332,22 +332,22 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -356,12 +356,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -381,8 +381,8 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -394,8 +394,8 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_add_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,12 +407,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_add_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -424,8 +424,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -442,8 +442,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_add_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,11 +460,11 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_add_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -496,16 +496,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_add_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -513,12 +513,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_add_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -535,9 +535,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -557,20 +557,20 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -579,12 +579,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -616,8 +616,8 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_and_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,12 +629,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_and_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -647,8 +647,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -665,29 +665,29 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -703,9 +703,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -720,18 +720,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -739,12 +739,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -762,9 +762,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -784,22 +784,22 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -808,12 +808,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -833,8 +833,8 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -846,8 +846,8 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_and_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,12 +859,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_and_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -876,8 +876,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -894,8 +894,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_and_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -912,11 +912,11 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_and_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -931,9 +931,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -948,16 +948,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_and_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -965,12 +965,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_and_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -987,9 +987,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1009,20 +1009,20 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1031,12 +1031,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1055,8 +1055,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_sub_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1081,12 +1081,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_sub_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1099,8 +1099,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,29 +1117,29 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1155,9 +1155,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1172,18 +1172,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1191,12 +1191,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_sub_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1214,9 +1214,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1236,22 +1236,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1260,12 +1260,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1285,8 +1285,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1298,8 +1298,8 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_sub_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1311,12 +1311,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_sub_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1328,8 +1328,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1346,8 +1346,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_sub_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,11 +1364,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1383,9 +1383,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1400,16 +1400,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_sub_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1417,12 +1417,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_sub_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1439,9 +1439,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1461,20 +1461,20 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1483,12 +1483,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1507,8 +1507,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,8 +1520,8 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_max_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1533,12 +1533,12 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_max_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1551,8 +1551,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1568,28 +1568,28 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1604,9 +1604,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1619,29 +1619,29 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1657,9 +1657,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1678,20 +1678,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1701,12 +1701,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1725,8 +1725,8 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1736,8 +1736,8 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_max_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,12 +1747,12 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_max_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -1762,8 +1762,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1779,8 +1779,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_max_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1796,11 +1796,11 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_max_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1814,9 +1814,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1829,27 +1829,27 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1864,9 +1864,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1885,18 +1885,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1906,12 +1906,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1929,8 +1929,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1940,8 +1940,8 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1951,12 +1951,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umax_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -1967,8 +1967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umax_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1984,28 +1984,28 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2020,9 +2020,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2035,29 +2035,29 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2073,9 +2073,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2094,20 +2094,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2117,12 +2117,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2141,8 +2141,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,8 +2152,8 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umax_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2163,12 +2163,12 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umax_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2178,8 +2178,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umax_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2195,8 +2195,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umax_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2212,11 +2212,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umax_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2230,9 +2230,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2245,27 +2245,27 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2280,9 +2280,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2301,18 +2301,18 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2322,12 +2322,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2345,8 +2345,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2356,8 +2356,8 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_min_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2367,12 +2367,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_min_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2383,8 +2383,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2400,28 +2400,28 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2436,9 +2436,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2451,29 +2451,29 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2489,9 +2489,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2510,20 +2510,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2533,12 +2533,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2557,8 +2557,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2568,8 +2568,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,12 +2579,12 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2594,8 +2594,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2611,8 +2611,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_min_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2628,11 +2628,11 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_min_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2646,9 +2646,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2661,27 +2661,27 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_min_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2696,9 +2696,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2717,18 +2717,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2738,12 +2738,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2761,8 +2761,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2772,8 +2772,8 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2783,12 +2783,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umin_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2799,8 +2799,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2816,28 +2816,28 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2852,9 +2852,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2867,29 +2867,29 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2905,9 +2905,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2926,20 +2926,20 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2949,12 +2949,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2973,8 +2973,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,8 +2984,8 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umin_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,12 +2995,12 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umin_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3010,8 +3010,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3027,8 +3027,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umin_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3044,11 +3044,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umin_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -3062,9 +3062,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3077,27 +3077,27 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3112,9 +3112,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3133,18 +3133,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3154,12 +3154,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3177,8 +3177,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3190,8 +3190,8 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_or_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3203,12 +3203,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_or_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3221,8 +3221,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3239,29 +3239,29 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3277,9 +3277,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3294,18 +3294,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; VI-LABEL: atomic_or_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3313,12 +3313,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; GFX9-LABEL: atomic_or_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3336,9 +3336,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3358,22 +3358,22 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3382,12 +3382,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3407,8 +3407,8 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3420,8 +3420,8 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_or_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3433,12 +3433,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_or_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3450,8 +3450,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3468,8 +3468,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: atomic_or_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3486,11 +3486,11 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: atomic_or_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3505,9 +3505,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3522,16 +3522,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_or_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3539,12 +3539,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_or_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3561,9 +3561,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3583,20 +3583,20 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3605,12 +3605,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3629,8 +3629,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3642,8 +3642,8 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3655,12 +3655,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3673,8 +3673,8 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) { ; SI-LABEL: atomic_xchg_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3686,8 +3686,8 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; VI-LABEL: atomic_xchg_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,12 +3699,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; GFX9-LABEL: atomic_xchg_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3717,8 +3717,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3735,29 +3735,29 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3773,9 +3773,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3790,18 +3790,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3809,12 +3809,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3832,9 +3832,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3854,22 +3854,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3878,12 +3878,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3903,8 +3903,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3916,8 +3916,8 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,12 +3929,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3946,8 +3946,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3964,8 +3964,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_xchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3982,11 +3982,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_xchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4001,9 +4001,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4018,16 +4018,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4035,12 +4035,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4057,9 +4057,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4079,20 +4079,20 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4101,12 +4101,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4125,7 +4125,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4140,7 +4140,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4173,8 +4173,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4192,31 +4192,31 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4233,10 +4233,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s7, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4252,19 +4252,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dword s6, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4273,17 +4273,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4298,10 +4298,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s10, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0x11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s10, s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4309,8 +4309,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 glc @@ -4322,24 +4322,24 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s9, s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4348,13 +4348,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4376,7 +4376,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4391,7 +4391,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; VI-LABEL: atomic_cmpxchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4423,8 +4423,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4442,8 +4442,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4461,12 +4461,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4482,10 +4482,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s7, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4501,17 +4501,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dword s6, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4520,17 +4520,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4544,10 +4544,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s10, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0x11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s10, s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4555,8 +4555,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 glc @@ -4568,22 +4568,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s9, s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4592,13 +4592,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4619,8 +4619,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4632,8 +4632,8 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_xor_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4645,12 +4645,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_xor_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4663,8 +4663,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4681,29 +4681,29 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4719,9 +4719,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4736,18 +4736,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4755,12 +4755,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4778,9 +4778,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4800,22 +4800,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4824,12 +4824,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4849,8 +4849,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4862,8 +4862,8 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xor_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,12 +4875,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xor_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4892,8 +4892,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4910,8 +4910,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_xor_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4928,11 +4928,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4947,9 +4947,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4964,16 +4964,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_xor_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4981,12 +4981,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_xor_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -5003,9 +5003,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5025,20 +5025,20 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -5047,12 +5047,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -5071,7 +5071,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5087,7 +5087,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5105,7 +5105,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5123,7 +5123,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5141,7 +5141,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i32_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5159,7 +5159,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i32_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc @@ -5177,7 +5177,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,7 +5193,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5229,7 +5229,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5245,7 +5245,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5261,7 +5261,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5278,8 +5278,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5298,8 +5298,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5320,11 +5320,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5343,8 +5343,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5363,8 +5363,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,11 +5383,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5405,8 +5405,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5425,8 +5425,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5447,11 +5447,11 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5470,8 +5470,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5481,25 +5481,25 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -5510,8 +5510,8 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5521,23 +5521,23 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5547,8 +5547,8 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5558,23 +5558,23 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic float %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5584,8 +5584,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5598,8 +5598,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; VI-LABEL: atomic_store_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5614,14 +5614,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; GFX9-LABEL: atomic_store_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5634,8 +5634,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5648,8 +5648,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; VI-LABEL: atomic_store_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5664,14 +5664,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; GFX9-LABEL: atomic_store_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5684,8 +5684,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5699,8 +5699,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5713,14 +5713,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5732,8 +5732,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5747,8 +5747,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; VI-LABEL: atomic_store_f32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5761,14 +5761,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; GFX9-LABEL: atomic_store_f32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5780,7 +5780,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5796,7 +5796,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; VI-LABEL: atomic_load_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5812,7 +5812,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; GFX9-LABEL: atomic_load_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc @@ -5830,7 +5830,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5848,7 +5848,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; VI-LABEL: atomic_load_i8_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5866,7 +5866,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; GFX9-LABEL: atomic_load_i8_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc @@ -5884,8 +5884,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5895,25 +5895,25 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %out, i64 16 @@ -5924,8 +5924,8 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5935,23 +5935,23 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1 @@ -5961,7 +5961,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5977,7 +5977,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5993,7 +5993,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -6011,7 +6011,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -6029,7 +6029,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i16_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6047,7 +6047,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i16_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -6065,8 +6065,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6076,25 +6076,25 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %out, i64 8 @@ -6105,8 +6105,8 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6116,23 +6116,23 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6142,8 +6142,8 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6153,25 +6153,25 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o ; ; VI-LABEL: atomic_store_f16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr half, ptr addrspace(1) %out, i64 8 @@ -6182,8 +6182,8 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6193,23 +6193,23 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic half %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6219,8 +6219,8 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6230,25 +6230,25 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) ; ; VI-LABEL: atomic_store_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8 store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -6258,8 +6258,8 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6269,23 +6269,23 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -6294,8 +6294,8 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,8 +6307,8 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_inc_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6320,12 +6320,12 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_inc_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6338,8 +6338,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6353,14 +6353,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6368,12 +6368,12 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6386,8 +6386,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6400,8 +6400,8 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_inc_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6414,12 +6414,12 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_inc_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6432,29 +6432,29 @@ entry: define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6462,12 +6462,12 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_inc_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6482,8 +6482,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_inc_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6500,29 +6500,29 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_inc_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6538,9 +6538,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6555,18 +6555,18 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6574,12 +6574,12 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6597,9 +6597,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6619,22 +6619,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_inc_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -6643,12 +6643,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -6668,8 +6668,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6681,8 +6681,8 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_dec_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6694,12 +6694,12 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_dec_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6712,8 +6712,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6727,14 +6727,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6742,12 +6742,12 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_dec_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6760,8 +6760,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6774,8 +6774,8 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_dec_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6788,12 +6788,12 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_dec_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6806,29 +6806,29 @@ entry: define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6836,12 +6836,12 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_dec_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6856,8 +6856,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_dec_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6874,29 +6874,29 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6912,9 +6912,9 @@ entry: define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6929,18 +6929,18 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6948,12 +6948,12 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6971,9 +6971,9 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6993,22 +6993,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_dec_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -7017,12 +7017,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -7042,7 +7042,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7058,7 +7058,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,7 +7074,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -7091,7 +7091,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7109,7 +7109,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_f16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7127,7 +7127,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_f16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -7144,7 +7144,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7160,7 +7160,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; VI-LABEL: atomic_load_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7176,7 +7176,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; GFX9-LABEL: atomic_load_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -7193,7 +7193,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7211,7 +7211,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_bf16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7229,7 +7229,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_bf16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index a7ba8a084272b..1fa7c52a68802 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4616,7 +4616,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4648,7 +4648,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4714,8 +4714,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4753,8 +4753,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4789,24 +4789,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -4829,7 +4829,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4861,7 +4861,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4924,8 +4924,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4963,8 +4963,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4997,24 +4997,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -5869,7 +5869,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -5901,7 +5901,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -5967,8 +5967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6006,8 +6006,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6042,24 +6042,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -6082,8 +6082,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6121,8 +6121,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6155,24 +6155,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -7860,7 +7860,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -7892,7 +7892,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -7958,8 +7958,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -7997,8 +7997,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8033,24 +8033,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -8073,36 +8073,36 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[0:1], 0x0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s3, s[4:5], 0x0 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s6, v1 +; SI-NEXT: v_min_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -8126,24 +8126,24 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8155,8 +8155,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -8194,8 +8194,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8228,24 +8228,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 3735c6349fbb3..e2d55990473c0 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: load_f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -19,8 +19,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; ; VI-LABEL: load_f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -31,10 +31,10 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; GFX11-LABEL: load_f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -46,8 +46,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: load_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -57,8 +57,8 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; ; VI-LABEL: load_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -69,10 +69,10 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; GFX11-LABEL: load_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -84,7 +84,7 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; ; GFX11-LABEL: load_v3f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -130,7 +130,7 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -145,12 +145,12 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: load_v8f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -159,12 +159,12 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; ; VI-LABEL: load_v8f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -174,8 +174,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-LABEL: load_v8f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -191,8 +191,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: extload_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -204,8 +204,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; ; VI-LABEL: extload_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -218,13 +218,13 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; GFX11-LABEL: extload_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -237,8 +237,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -248,8 +248,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -260,11 +260,11 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; GFX11-LABEL: extload_f16_to_f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -277,8 +277,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -290,8 +290,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -304,13 +304,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; GFX11-LABEL: extload_v2f16_to_v2f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -323,7 +323,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -336,7 +336,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -349,7 +349,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -368,7 +368,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -383,7 +383,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 @@ -419,8 +419,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -447,8 +447,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -476,8 +476,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; GFX11-LABEL: extload_v8f16_to_v8f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s8, s7, 16 @@ -506,10 +506,10 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -519,10 +519,10 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_load_dword s0, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -532,14 +532,14 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; GFX11-LABEL: extload_f16_to_f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -552,12 +552,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -568,12 +568,12 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_load_dword s0, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -584,17 +584,17 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; GFX11-LABEL: extload_v2f16_to_v2f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -607,7 +607,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -628,7 +628,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -649,7 +649,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -675,7 +675,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -725,7 +725,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s5, s3, 16 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -754,8 +754,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -801,8 +801,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -848,20 +848,22 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; GFX11-LABEL: extload_v8f16_to_v8f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s9, s7, 16 ; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s1, s5, 16 +; GFX11-NEXT: s_lshr_b32 s3, s5, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX11-NEXT: s_lshr_b32 s0, s4, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s2 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 @@ -870,9 +872,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 @@ -889,7 +889,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -902,7 +902,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: global_load_store_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -932,7 +932,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -949,7 +949,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -962,7 +962,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; ; GFX11-LABEL: global_load_store_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -979,7 +979,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -992,7 +992,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1042,7 +1042,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v2, s[2:3] @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3] @@ -1151,7 +1151,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -1358,7 +1358,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v20, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1493,7 +1493,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v4, s[2:3] @@ -1553,7 +1553,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1602,7 +1602,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] @@ -1718,7 +1718,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1766,7 +1766,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1810,7 +1810,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,7 +1947,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2033,7 +2033,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2102,7 +2102,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2116,7 +2116,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; ; GFX11-LABEL: global_truncstore_f32_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2135,7 +2135,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2152,7 +2152,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2168,7 +2168,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v2f32_to_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -2190,7 +2190,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2213,7 +2213,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v3f32_to_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3] @@ -2260,7 +2260,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2300,7 +2300,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v4f32_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2325,7 +2325,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2360,7 +2360,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2391,7 +2391,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v8f32_to_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2425,7 +2425,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2554,7 +2554,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; GFX11-LABEL: global_truncstore_v16f32_to_v16f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -2606,12 +2606,12 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 { ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2622,8 +2622,8 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2636,13 +2636,13 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; GFX11-LABEL: fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_f16_e64 v1, s4, s2 +; GFX11-NEXT: v_add_f16_e64 v1, s2, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2655,7 +2655,7 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 { ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2676,7 +2676,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2693,7 +2693,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 @@ -2709,7 +2709,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: fadd_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2787,8 +2787,8 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2845,8 +2845,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s15, 16 ; VI-NEXT: s_lshr_b32 s3, s11, 16 @@ -2888,8 +2888,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; GFX11-LABEL: fadd_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, s7, s11 @@ -2908,7 +2908,7 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2921,7 +2921,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: test_bitcast_from_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2939,7 +2939,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2952,7 +2952,7 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_bitcast_to_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index b62bf890e65fe..f736ca7cd625a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -4,22 +4,22 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { ; GCN-LABEL: float4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_inselt_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v2, v0 @@ -56,23 +56,23 @@ entry: define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) { ; GCN-LABEL: int4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: s_cselect_b32 s2, s7, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 -; GCN-NEXT: s_cselect_b32 s3, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cselect_b32 s3, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cselect_b32 s6, s6, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, s4, 1 ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -85,15 +85,15 @@ entry: define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) { ; GCN-LABEL: float2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc @@ -109,21 +109,21 @@ entry: define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) { ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GCN-NEXT: s_load_dword s12, s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 ; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: s_mov_b32 m0, s12 ; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v8, s2 @@ -142,14 +142,14 @@ entry: define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) { ; GCN-LABEL: float16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s20, s[0:1], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -166,24 +166,24 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v14, s18 ; GCN-NEXT: v_mov_b32_e32 v15, s19 ; GCN-NEXT: s_mov_b32 m0, s20 -; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_add_u32 s0, s2, 32 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 -; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s1 +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -195,18 +195,18 @@ entry: define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) { ; GCN-LABEL: float32_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x124 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v33, s3 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x70 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v33, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 ; GCN-NEXT: v_mov_b32_e32 v5, s41 @@ -236,48 +236,48 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: v_mov_b32_e32 v32, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-NEXT: v_mov_b32_e32 v32, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x60 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s3 -; GCN-NEXT: v_mov_b32_e32 v28, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v29, s1 +; GCN-NEXT: v_mov_b32_e32 v28, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x50 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s3 -; GCN-NEXT: v_mov_b32_e32 v24, s2 -; GCN-NEXT: s_add_u32 s2, s0, 64 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v25, s1 +; GCN-NEXT: v_mov_b32_e32 v24, s0 +; GCN-NEXT: s_add_u32 s0, s2, 64 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 -; GCN-NEXT: v_mov_b32_e32 v20, s2 -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v21, s1 +; GCN-NEXT: v_mov_b32_e32 v20, s0 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 -; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_add_u32 s0, s2, 32 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 -; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s1 +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -289,8 +289,8 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -314,7 +314,7 @@ entry: define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) { ; GCN-LABEL: half2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -335,49 +335,49 @@ entry: define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) { ; GCN-LABEL: half8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 16 +; GCN-NEXT: s_lshr_b32 s3, s6, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 16 +; GCN-NEXT: s_lshr_b32 s3, s5, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_lshr_b32 s3, s4, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -397,7 +397,7 @@ entry: define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) { ; GCN-LABEL: short2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -418,8 +418,8 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s4, s4, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 @@ -467,99 +467,99 @@ entry: define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s7, 24 -; GCN-NEXT: s_cmp_lg_u32 s8, 15 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 s3, s7, 24 +; GCN-NEXT: s_cmp_lg_u32 s2, 15 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 14 +; GCN-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 14 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s7, 8 +; GCN-NEXT: s_lshr_b32 s3, s7, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 13 +; GCN-NEXT: s_cmp_lg_u32 s2, 13 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 12 +; GCN-NEXT: s_cmp_lg_u32 s2, 12 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s2, s6, 24 +; GCN-NEXT: s_lshr_b32 s3, s6, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 11 +; GCN-NEXT: s_cmp_lg_u32 s2, 11 ; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 10 +; GCN-NEXT: s_lshr_b32 s3, s6, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 10 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 8 +; GCN-NEXT: s_lshr_b32 s3, s6, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 9 +; GCN-NEXT: s_cmp_lg_u32 s2, 9 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 8 +; GCN-NEXT: s_cmp_lg_u32 s2, 8 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s2, s5, 24 +; GCN-NEXT: s_lshr_b32 s3, s5, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 7 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 ; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 8 +; GCN-NEXT: s_lshr_b32 s3, s5, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_lshr_b32 s2, s4, 24 +; GCN-NEXT: s_lshr_b32 s3, s4, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_lshr_b32 s3, s4, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 8 +; GCN-NEXT: s_lshr_b32 s3, s4, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -580,21 +580,21 @@ entry: define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) { ; GCN-LABEL: double2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s8, 1 -; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s7 -; GCN-NEXT: s_cselect_b32 s3, 0, s6 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7 +; GCN-NEXT: s_cselect_b32 s6, 0, s6 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5 ; GCN-NEXT: s_cselect_b32 s4, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -607,10 +607,10 @@ entry: define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s12, s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x84 -; GCN-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 +; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 @@ -661,12 +661,12 @@ entry: define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) { ; GCN-LABEL: double8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s20, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 @@ -717,17 +717,17 @@ entry: define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) { ; GCN-LABEL: double7_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x94 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x84 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x94 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x84 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 @@ -738,25 +738,25 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NEXT: v_mov_b32_e32 v12, s16 ; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_add_u32 s0, s2, 16 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v15, s3 -; GCN-NEXT: v_mov_b32_e32 v14, s2 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v15, s1 +; GCN-NEXT: v_mov_b32_e32 v14, s0 ; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_add_u32 s0, s0, 32 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_add_u32 s0, s2, 32 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -770,15 +770,14 @@ entry: define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) { ; GCN-LABEL: double16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x124 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_lshl_b32 s0, s0, 1 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 @@ -810,7 +809,7 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -868,22 +867,20 @@ entry: define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) { ; GCN-LABEL: double15_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x114 -; GCN-NEXT: s_load_dwordx4 s[20:23], s[2:3], 0x104 -; GCN-NEXT: s_load_dwordx8 s[24:31], s[2:3], 0xe4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x114 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x104 +; GCN-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_load_dword s4, s[2:3], 0x124 -; GCN-NEXT: v_mov_b32_e32 v28, s0 -; GCN-NEXT: v_mov_b32_e32 v29, s1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x124 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v28, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s4, 1 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_lshl_b32 s2, s4, 1 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 @@ -909,8 +906,9 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v25, s21 ; GCN-NEXT: v_mov_b32_e32 v26, s22 ; GCN-NEXT: v_mov_b32_e32 v27, s23 +; GCN-NEXT: v_mov_b32_e32 v29, s3 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -964,13 +962,13 @@ entry: define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-NEXT: s_add_u32 s12, s12, s9 -; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xe80000 +; GCN-NEXT: s_add_u32 s4, s4, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 3 @@ -982,16 +980,16 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 -; GCN-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:3 -; GCN-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:2 -; GCN-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:1 +; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3 +; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2 +; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: buffer_store_byte v1, v0, s[12:15], 0 offen -; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1 -; GCN-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 -; GCN-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:3 +; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen +; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1 +; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2 +; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) @@ -1019,11 +1017,11 @@ entry: define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit128_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x44 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s3, s4, 24 +; GCN-NEXT: s_lshr_b32 s1, s4, 24 ; GCN-NEXT: s_lshr_b32 s8, s4, 16 ; GCN-NEXT: s_lshr_b32 s9, s4, 17 ; GCN-NEXT: s_lshr_b32 s10, s4, 18 @@ -1059,10 +1057,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_lshr_b32 s41, s7, 21 ; GCN-NEXT: s_lshr_b32 s42, s7, 22 ; GCN-NEXT: s_lshr_b32 s43, s7, 23 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x77 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x77 ; GCN-NEXT: v_mov_b32_e32 v15, s43 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x76 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x76 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s42 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1070,11 +1068,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x75 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x75 ; GCN-NEXT: v_or_b32_e32 v15, v15, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s41 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x74 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x74 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s40 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1083,11 +1081,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x73 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x73 ; GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_mov_b32_e32 v18, s39 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x72 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x72 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s38 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1095,11 +1093,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x71 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x71 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s37 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x70 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x70 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s36 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1111,11 +1109,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7f ; GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s35 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1123,11 +1121,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1136,22 +1134,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x78 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x78 ; GCN-NEXT: v_mov_b32_e32 v13, s35 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x79 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x79 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s35 ; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc @@ -1166,11 +1164,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6f ; GCN-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 14, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1178,11 +1176,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 13, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 12, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1191,11 +1189,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 10, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1203,11 +1201,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x69 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 9, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x68 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1219,11 +1217,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x67 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x66 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1231,11 +1229,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x65 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x64 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1244,11 +1242,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x63 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x62 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1256,11 +1254,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x61 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x61 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x60 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 ; GCN-NEXT: v_mov_b32_e32 v16, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1273,11 +1271,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 4, v18 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 ; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x57 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 ; GCN-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v17, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x56 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s33 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1285,11 +1283,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x55 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s31 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x54 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s30 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1298,11 +1296,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x53 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_mov_b32_e32 v18, s29 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x52 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s28 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1310,11 +1308,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x51 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x50 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s26 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1326,11 +1324,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s25 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1338,11 +1336,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1351,22 +1349,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x58 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x58 ; GCN-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x59 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x59 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s25 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc @@ -1380,11 +1378,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4f ; GCN-NEXT: v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v3, 15, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4e ; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1392,11 +1390,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4d ; GCN-NEXT: v_or_b32_e32 v3, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4c ; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1405,11 +1403,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4b ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4a ; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1417,11 +1415,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x49 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x49 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 9, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x48 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x48 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 8, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1433,11 +1431,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 12, v3 ; GCN-NEXT: v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x47 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x47 ; GCN-NEXT: v_or_b32_e32 v18, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 7, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x46 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x46 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1445,11 +1443,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x45 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x45 ; GCN-NEXT: v_or_b32_e32 v3, v3, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x44 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x44 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1458,11 +1456,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x43 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x43 ; GCN-NEXT: v_or_b32_e32 v19, v19, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 3, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x42 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x42 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1470,11 +1468,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x41 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x41 ; GCN-NEXT: v_or_b32_e32 v3, v3, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 64 +; GCN-NEXT: s_cmp_lg_u32 s0, 64 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1487,11 +1485,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v19 ; GCN-NEXT: v_and_b32_e32 v2, 15, v2 -; GCN-NEXT: s_cmp_lg_u32 s2, 55 +; GCN-NEXT: s_cmp_lg_u32 s0, 55 ; GCN-NEXT: v_or_b32_e32 v2, v2, v15 ; GCN-NEXT: v_mov_b32_e32 v15, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 54 +; GCN-NEXT: s_cmp_lg_u32 s0, 54 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s23 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1499,12 +1497,12 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 53 +; GCN-NEXT: s_cmp_lg_u32 s0, 53 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 52 +; GCN-NEXT: s_cmp_lg_u32 s0, 52 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s21 @@ -1514,11 +1512,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 51 +; GCN-NEXT: s_cmp_lg_u32 s0, 51 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_mov_b32_e32 v16, s20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 50 +; GCN-NEXT: s_cmp_lg_u32 s0, 50 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1526,11 +1524,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 49 +; GCN-NEXT: s_cmp_lg_u32 s0, 49 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 48 +; GCN-NEXT: s_cmp_lg_u32 s0, 48 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1542,11 +1540,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 63 +; GCN-NEXT: s_cmp_lg_u32 s0, 63 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 62 +; GCN-NEXT: s_cmp_lg_u32 s0, 62 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s16 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1554,11 +1552,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 61 +; GCN-NEXT: s_cmp_lg_u32 s0, 61 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 60 +; GCN-NEXT: s_cmp_lg_u32 s0, 60 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1567,22 +1565,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 59 +; GCN-NEXT: s_cmp_lg_u32 s0, 59 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 58 +; GCN-NEXT: s_cmp_lg_u32 s0, 58 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: s_cmp_lg_u32 s2, 56 +; GCN-NEXT: s_cmp_lg_u32 s0, 56 ; GCN-NEXT: v_mov_b32_e32 v14, s16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 57 +; GCN-NEXT: s_cmp_lg_u32 s0, 57 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s16 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc @@ -1596,11 +1594,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 -; GCN-NEXT: s_cmp_lg_u32 s2, 47 +; GCN-NEXT: s_cmp_lg_u32 s0, 47 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v14, 15, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 46 +; GCN-NEXT: s_cmp_lg_u32 s0, 46 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1608,11 +1606,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 45 +; GCN-NEXT: s_cmp_lg_u32 s0, 45 ; GCN-NEXT: v_or_b32_e32 v14, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 44 +; GCN-NEXT: s_cmp_lg_u32 s0, 44 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1621,11 +1619,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 43 +; GCN-NEXT: s_cmp_lg_u32 s0, 43 ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 11, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 42 +; GCN-NEXT: s_cmp_lg_u32 s0, 42 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 10, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1633,11 +1631,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 41 +; GCN-NEXT: s_cmp_lg_u32 s0, 41 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 9, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 40 +; GCN-NEXT: s_cmp_lg_u32 s0, 40 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 8, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1649,11 +1647,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GCN-NEXT: v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 39 +; GCN-NEXT: s_cmp_lg_u32 s0, 39 ; GCN-NEXT: v_or_b32_e32 v16, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 7, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 38 +; GCN-NEXT: s_cmp_lg_u32 s0, 38 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1661,11 +1659,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 37 +; GCN-NEXT: s_cmp_lg_u32 s0, 37 ; GCN-NEXT: v_or_b32_e32 v14, v14, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 36 +; GCN-NEXT: s_cmp_lg_u32 s0, 36 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1674,11 +1672,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 35 +; GCN-NEXT: s_cmp_lg_u32 s0, 35 ; GCN-NEXT: v_or_b32_e32 v17, v17, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 3, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 34 +; GCN-NEXT: s_cmp_lg_u32 s0, 34 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1686,11 +1684,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmp_lg_u32 s2, 33 +; GCN-NEXT: s_cmp_lg_u32 s0, 33 ; GCN-NEXT: v_or_b32_e32 v18, v14, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 1, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 32 +; GCN-NEXT: s_cmp_lg_u32 s0, 32 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1704,11 +1702,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v1, 15, v1 ; GCN-NEXT: v_or_b32_e32 v1, v1, v17 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 23 +; GCN-NEXT: s_cmp_lg_u32 s0, 23 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v15, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 22 +; GCN-NEXT: s_cmp_lg_u32 s0, 22 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s14 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1716,11 +1714,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 21 +; GCN-NEXT: s_cmp_lg_u32 s0, 21 ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s13 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 20 +; GCN-NEXT: s_cmp_lg_u32 s0, 20 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s12 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1729,11 +1727,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 19 +; GCN-NEXT: s_cmp_lg_u32 s0, 19 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_mov_b32_e32 v16, s11 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 18 +; GCN-NEXT: s_cmp_lg_u32 s0, 18 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s10 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1741,11 +1739,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 17 +; GCN-NEXT: s_cmp_lg_u32 s0, 17 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s9 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 16 +; GCN-NEXT: s_cmp_lg_u32 s0, 16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s8 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1757,24 +1755,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 31 +; GCN-NEXT: s_cmp_lg_u32 s0, 31 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 30 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 30 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s1 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 29 +; GCN-NEXT: s_cmp_lg_u32 s0, 29 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 28 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 28 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s1 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1782,24 +1780,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v17, v19, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 27 +; GCN-NEXT: s_cmp_lg_u32 s0, 27 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 26 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 26 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s1 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: s_cmp_lg_u32 s2, 24 -; GCN-NEXT: v_mov_b32_e32 v18, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 24 +; GCN-NEXT: v_mov_b32_e32 v18, s1 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 25 +; GCN-NEXT: s_cmp_lg_u32 s0, 25 ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s1 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1811,11 +1809,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 15 +; GCN-NEXT: s_cmp_lg_u32 s0, 15 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v16, 15, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 14 +; GCN-NEXT: s_cmp_lg_u32 s0, 14 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1823,11 +1821,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 13 +; GCN-NEXT: s_cmp_lg_u32 s0, 13 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 12 +; GCN-NEXT: s_cmp_lg_u32 s0, 12 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s4 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1835,52 +1833,52 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 11 +; GCN-NEXT: s_cmp_lg_u32 s0, 11 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s4 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 10 +; GCN-NEXT: s_cmp_lg_u32 s0, 10 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 10, s4 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 9 +; GCN-NEXT: s_cmp_lg_u32 s0, 9 ; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 8 +; GCN-NEXT: s_cmp_lg_u32 s0, 8 ; GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 ; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: s_cmp_lg_u32 s0, 7 ; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 ; GCN-NEXT: v_cndmask_b32_e32 v11, 1, v11, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_cmp_lg_u32 s0, 6 ; GCN-NEXT: v_lshrrev_b16_e64 v9, 6, s4 ; GCN-NEXT: v_cndmask_b32_e32 v10, 1, v10, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cmp_lg_u32 s0, 5 ; GCN-NEXT: v_lshrrev_b16_e64 v8, 5, s4 ; GCN-NEXT: v_cndmask_b32_e32 v9, 1, v9, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s0, 4 ; GCN-NEXT: v_lshrrev_b16_e64 v7, 4, s4 ; GCN-NEXT: v_cndmask_b32_e32 v8, 1, v8, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s0, 3 ; GCN-NEXT: v_lshrrev_b16_e64 v6, 3, s4 ; GCN-NEXT: v_cndmask_b32_e32 v7, 1, v7, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cmp_lg_u32 s0, 2 ; GCN-NEXT: v_lshrrev_b16_e64 v5, 2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v6, 1, v6, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s0, 1 ; GCN-NEXT: v_lshrrev_b16_e64 v4, 1, s4 ; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1919,9 +1917,9 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v11, v16, v11 ; GCN-NEXT: v_or_b32_e32 v0, v0, v7 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index b9dc27cb7e019..d5265e364a17e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -130,9 +130,8 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -291,9 +290,8 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v2, v3 ; GFX12-NEXT: v_or_b32_e32 v2, -5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -438,127 +436,127 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX90A-NEXT: s_sub_i32 s0, 0, s7 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s5, v0 -; GFX90A-NEXT: s_mul_i32 s4, s4, s5 -; GFX90A-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX90A-NEXT: s_add_i32 s5, s5, s4 -; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX90A-NEXT: s_mul_i32 s5, s4, s3 -; GFX90A-NEXT: s_sub_i32 s2, s2, s5 -; GFX90A-NEXT: s_add_i32 s6, s4, 1 -; GFX90A-NEXT: s_sub_i32 s5, s2, s3 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s6, s4 -; GFX90A-NEXT: s_cselect_b32 s2, s5, s2 -; GFX90A-NEXT: s_add_i32 s5, s4, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s2, s5, s4 -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: v_readfirstlane_b32 s1, v0 +; GFX90A-NEXT: s_mul_i32 s0, s0, s1 +; GFX90A-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX90A-NEXT: s_add_i32 s1, s1, s0 +; GFX90A-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX90A-NEXT: s_mul_i32 s1, s0, s7 +; GFX90A-NEXT: s_sub_i32 s1, s6, s1 +; GFX90A-NEXT: s_add_i32 s2, s0, 1 +; GFX90A-NEXT: s_sub_i32 s3, s1, s7 +; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 +; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 +; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 +; GFX90A-NEXT: s_add_i32 s2, s0, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 +; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX10-NEXT: s_sub_i32 s5, 0, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s5, s5, s4 -; GFX10-NEXT: s_mul_hi_u32 s5, s4, s5 -; GFX10-NEXT: s_add_i32 s4, s4, s5 -; GFX10-NEXT: s_mul_hi_u32 s4, s2, s4 -; GFX10-NEXT: s_mul_i32 s5, s4, s3 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_add_i32 s5, s4, 1 -; GFX10-NEXT: s_sub_i32 s6, s2, s3 -; GFX10-NEXT: s_cmp_ge_u32 s2, s3 -; GFX10-NEXT: s_cselect_b32 s4, s5, s4 -; GFX10-NEXT: s_cselect_b32 s2, s6, s2 -; GFX10-NEXT: s_add_i32 s5, s4, 1 -; GFX10-NEXT: s_cmp_ge_u32 s2, s3 -; GFX10-NEXT: s_cselect_b32 s2, s5, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_mul_i32 s1, s1, s0 +; GFX10-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX10-NEXT: s_mul_i32 s1, s0, s7 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_sub_i32 s1, s6, s1 +; GFX10-NEXT: s_sub_i32 s3, s1, s7 +; GFX10-NEXT: s_cmp_ge_u32 s1, s7 +; GFX10-NEXT: s_cselect_b32 s0, s2, s0 +; GFX10-NEXT: s_cselect_b32 s1, s3, s1 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_cmp_ge_u32 s1, s7 +; GFX10-NEXT: s_cselect_b32 s0, s2, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-FLATSCR-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-FLATSCR-NEXT: s_add_i32 s5, s5, s4 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-FLATSCR-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-FLATSCR-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-FLATSCR-NEXT: s_add_i32 s6, s4, 1 -; GFX9-FLATSCR-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-FLATSCR-NEXT: s_add_i32 s5, s4, 1 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-FLATSCR-NEXT: s_add_i32 s1, s1, s0 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-FLATSCR-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 +; GFX9-FLATSCR-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: s_endpgm ; ; GFX11-LABEL: udiv_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX11-NEXT: s_sub_i32 s5, 0, s3 @@ -595,7 +593,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX12-LABEL: udiv_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 @@ -694,19 +692,19 @@ main_body: define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-LABEL: atomic_add_local: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_mul_i32 s1, s1, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: @@ -714,19 +712,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX90A-LABEL: atomic_add_local: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB5_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: s_mul_i32 s0, s0, 5 -; GFX90A-NEXT: v_mov_b32_e32 v1, s0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: s_mul_i32 s1, s1, 5 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: ds_add_u32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB5_2: @@ -734,18 +732,18 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX10-LABEL: atomic_add_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, exec_lo -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s1, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX10-NEXT: s_mul_i32 s0, s0, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX10-NEXT: s_mul_i32 s1, s1, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -754,19 +752,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX9-FLATSCR-LABEL: atomic_add_local: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], exec -; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s1, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB5_2: @@ -774,19 +772,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX11-LABEL: atomic_add_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB5_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX11-NEXT: s_bcnt1_i32_b32 s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s0, s0, 5 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: s_mul_i32 s1, s1, 5 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_add_u32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -795,20 +793,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX12-LABEL: atomic_add_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB5_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX12-NEXT: s_bcnt1_i32_b32 s1, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s0, s0, 5 -; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 -; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -877,9 +874,8 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -898,10 +894,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -910,8 +906,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -927,10 +923,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB7_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -939,8 +935,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB7_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -951,26 +947,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: atomic_add_ret_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB7_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10-NEXT: s_mul_i32 s1, s1, 5 -; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_mul_i32 s3, s3, 5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -986,10 +982,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -998,8 +994,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB7_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1010,26 +1006,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: atomic_add_ret_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mul_i32 s3, s3, 5 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB7_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1041,27 +1037,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: atomic_add_ret_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB7_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s1, s1, 5 -; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 -; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: s_mul_i32 s3, s3, 5 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -1088,10 +1083,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1099,8 +1094,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1116,10 +1111,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB8_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -1127,8 +1122,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB8_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1139,24 +1134,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: add_i32_constant: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB8_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10-NEXT: s_mul_i32 s1, s1, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_mul_i32 s3, s3, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1172,10 +1167,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -1183,8 +1178,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB8_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1195,25 +1190,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: add_i32_constant: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_mul_i32 s3, s3, 5 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1225,25 +1220,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: add_i32_constant: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB8_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s1, s1, 5 -; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: s_mul_i32 s3, s3, 5 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -1660,3 +1655,4 @@ entry: %bc = bitcast <2 x i32> %r.1 to <2 x float> ret <2 x float> %bc } + diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index f9073be7e260b..69f181fcede30 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; SI-LABEL: i8_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s2, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -20,10 +20,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; VI-LABEL: i8_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xff +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,8 +32,8 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; GFX9-LABEL: i8_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -80,11 +80,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; SI-LABEL: i8_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s2, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -92,10 +92,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; VI-LABEL: i8_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xff +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -104,8 +104,8 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; GFX9-LABEL: i8_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -155,11 +155,11 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; SI-LABEL: i8_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i8 s4, s4 +; SI-NEXT: s_sext_i32_i8 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -167,10 +167,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; VI-LABEL: i8_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i8 s2, s4 +; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -179,8 +179,8 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; GFX9-LABEL: i8_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -230,11 +230,11 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; SI-LABEL: i16_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -242,10 +242,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; VI-LABEL: i16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -254,8 +254,8 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; GFX9-LABEL: i16_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -302,11 +302,11 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; SI-LABEL: i16_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -314,10 +314,10 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; VI-LABEL: i16_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -326,8 +326,8 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; GFX9-LABEL: i16_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -377,11 +377,11 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind { ; SI-LABEL: i16_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s4 +; SI-NEXT: s_sext_i32_i16 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -389,10 +389,10 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; VI-LABEL: i16_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s2, s4 +; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -401,8 +401,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; GFX9-LABEL: i16_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -452,8 +452,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; SI-LABEL: i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,19 +463,19 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; VI-LABEL: i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -511,8 +511,8 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; SI-LABEL: f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -522,19 +522,19 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; VI-LABEL: f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -570,8 +570,8 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; SI-LABEL: v2i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -581,19 +581,19 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; VI-LABEL: v2i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -659,8 +659,8 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; SI-LABEL: v2i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -670,19 +670,19 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; VI-LABEL: v2i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -718,7 +718,7 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; SI-LABEL: v2i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,7 +731,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; VI-LABEL: v2i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; GFX9-LABEL: v2i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -781,7 +781,7 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; SI-LABEL: v2f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; VI-LABEL: v2f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -805,7 +805,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; GFX9-LABEL: v2f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -844,8 +844,8 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; SI-LABEL: v3i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 @@ -858,26 +858,26 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; VI-LABEL: v3i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_byte v[2:3], v5 ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -983,7 +983,7 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; SI-LABEL: v3i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -998,7 +998,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; VI-LABEL: v3i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1014,7 +1014,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; GFX9-LABEL: v3i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1102,8 +1102,8 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; SI-LABEL: v3i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,8 +1117,8 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; VI-LABEL: v3i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1130,14 +1130,14 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; GFX9-LABEL: v3i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3i32_arg: @@ -1181,8 +1181,8 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; SI-LABEL: v3f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,8 +1196,8 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; VI-LABEL: v3f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1209,14 +1209,14 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; GFX9-LABEL: v3f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3f32_arg: @@ -1260,8 +1260,8 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; SI-LABEL: v4i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,19 +1271,19 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; VI-LABEL: v4i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1319,7 +1319,7 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; SI-LABEL: v4i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; VI-LABEL: v4i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -1343,7 +1343,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; GFX9-LABEL: v4i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1382,8 +1382,8 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; SI-LABEL: v4i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; VI-LABEL: v4i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1410,15 +1410,15 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; GFX9-LABEL: v4i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4i32_arg: @@ -1456,8 +1456,8 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; SI-LABEL: v4f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,8 +1470,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; VI-LABEL: v4f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1484,15 +1484,15 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; GFX9-LABEL: v4f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4f32_arg: @@ -1530,7 +1530,7 @@ entry: define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { ; SI-LABEL: v5i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; GFX9-LABEL: v5i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1671,50 +1671,50 @@ entry: define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind { ; SI-LABEL: v5i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s2, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s5, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s5, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 8 +; VI-NEXT: s_add_u32 s4, s2, 8 ; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: flat_store_short v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_short v2, v3, s[4:5] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i16_arg: @@ -1902,27 +1902,27 @@ entry: define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind { ; SI-LABEL: v5i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s8, s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s7, s[2:3], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; ; GFX9-LABEL: v5i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s8, s[6:7], 0x30 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, s8 @@ -1951,8 +1951,8 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dword v4, v5, s[4:5] offset:16 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i32_arg: @@ -2000,27 +2000,27 @@ entry: define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind { ; SI-LABEL: v5f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s8, s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s7, s[2:3], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -2039,19 +2039,19 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float ; ; GFX9-LABEL: v5f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x30 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: global_store_dword v4, v0, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5f32_arg: @@ -2099,34 +2099,34 @@ entry: define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind { ; SI-LABEL: v5i64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2155,9 +2155,9 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> ; ; GFX9-LABEL: v5i64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2241,34 +2241,34 @@ entry: define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind { ; SI-LABEL: v5f64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2297,9 +2297,9 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; ; GFX9-LABEL: v5f64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2384,7 +2384,7 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; SI-LABEL: v8i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,7 +2397,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; VI-LABEL: v8i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; GFX9-LABEL: v8i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2635,8 +2635,8 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; SI-LABEL: v8i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,8 +2649,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; VI-LABEL: v8i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2663,15 +2663,15 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; GFX9-LABEL: v8i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8i16_arg: @@ -2883,8 +2883,8 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; SI-LABEL: v8i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2903,8 +2903,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; VI-LABEL: v8i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -2926,8 +2926,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; GFX9-LABEL: v8i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -2994,8 +2994,8 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; SI-LABEL: v8f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3014,8 +3014,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; VI-LABEL: v8f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3037,8 +3037,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; GFX9-LABEL: v8f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -3106,8 +3106,8 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; SI-LABEL: v16i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3120,8 +3120,8 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; VI-LABEL: v16i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3134,15 +3134,15 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; GFX9-LABEL: v16i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v16i8_arg: @@ -3556,8 +3556,8 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; SI-LABEL: v16i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3576,8 +3576,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; VI-LABEL: v16i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3599,8 +3599,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; GFX9-LABEL: v16i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -4012,8 +4012,8 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; SI-LABEL: v16i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4044,8 +4044,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; VI-LABEL: v16i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4085,8 +4085,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; GFX9-LABEL: v16i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4200,8 +4200,8 @@ entry: define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; SI-LABEL: v16f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4232,8 +4232,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; VI-LABEL: v16f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4273,8 +4273,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; GFX9-LABEL: v16f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4388,7 +4388,7 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: kernel_arg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4401,7 +4401,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; VI-LABEL: kernel_arg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4412,7 +4412,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; GFX9-LABEL: kernel_arg_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4450,7 +4450,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; SI-LABEL: f64_kernel_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4463,7 +4463,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: f64_kernel_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4474,7 +4474,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; GFX9-LABEL: f64_kernel_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4522,8 +4522,8 @@ entry: define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { ; SI-LABEL: i65_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s8, s4, 1 @@ -4539,8 +4539,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -4558,11 +4558,11 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; GFX9-LABEL: i65_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_and_b32 s4, s6, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -4640,11 +4640,11 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s2, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -4652,10 +4652,10 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; VI-LABEL: i1_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4664,8 +4664,8 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; GFX9-LABEL: i1_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4731,11 +4731,11 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s2, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4743,10 +4743,10 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4755,8 +4755,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4803,8 +4803,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4816,11 +4816,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -4829,8 +4829,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4879,11 +4879,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s4, 0x10000 +; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4891,10 +4891,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s2, s4, 0x10000 +; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4903,8 +4903,8 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_sext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 @@ -4953,11 +4953,11 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -4966,21 +4966,21 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_sext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 @@ -5062,10 +5062,10 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { ; SI-LABEL: struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb -; SI-NEXT: s_load_dword s9, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x11 +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s9, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -5089,46 +5089,46 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; ; VI-LABEL: struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s5, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x44 +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s5, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: struct_argument_alignment: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s5, s[6:7], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -5196,7 +5196,6 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; SI-LABEL: packed_struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dword s6, s[0:1], 0x9 @@ -5230,37 +5229,37 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; VI-LABEL: packed_struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 49 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_add_u32 s4, s2, 50 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s0, 3 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 51 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s2, s0, 49 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s4, s0, 50 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_add_u32 s2, s2, 3 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 51 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: s_add_u32 s0, s2, 53 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s2, s0, 53 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v7 @@ -5281,10 +5280,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; GFX9-LABEL: packed_struct_argument_alignment: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_dword v6, v2, s[6:7] offset:13 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] offset:17 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x4 +; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5380,11 +5379,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { ; SI-LABEL: struct_argument_alignment_after: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb -; SI-NEXT: s_load_dword s13, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x15 +; SI-NEXT: s_load_dword s12, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dword s13, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -5414,11 +5413,11 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; VI-LABEL: struct_argument_alignment_after: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; VI-NEXT: s_load_dword s9, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x54 +; VI-NEXT: s_load_dword s8, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dword s9, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5446,19 +5445,19 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; GFX9-LABEL: struct_argument_alignment_after: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 +; GFX9-NEXT: s_load_dword s10, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 @@ -5546,7 +5545,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; SI-LABEL: array_3xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5566,7 +5565,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5584,7 +5583,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; GFX9-LABEL: array_3xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -5660,8 +5659,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; SI-LABEL: array_3xi16: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 @@ -5681,22 +5679,22 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; ; VI-LABEL: array_3xi16: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 38 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s2, 42 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s2, s0, 38 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s0, 42 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_ushort v4, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5713,10 +5711,10 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GFX9-LABEL: array_3xi16: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 -; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] offset:2 -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 +; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -5831,7 +5829,6 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; SI-LABEL: small_array_round_down_offset: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 @@ -5842,8 +5839,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; ; VI-LABEL: small_array_round_down_offset: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 37 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 37 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ubyte v0, v[0:1] @@ -5855,7 +5852,7 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; GFX9-LABEL: small_array_round_down_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5889,8 +5886,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_align_constant_i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x49 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5904,13 +5901,13 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; VI-LABEL: byref_align_constant_i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x124 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -5919,8 +5916,8 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; GFX9-LABEL: byref_align_constant_i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x100 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -5973,83 +5970,83 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_natural_align_constant_v16i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s20, s[2:3], 0x29 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x29 +; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: byref_natural_align_constant_v16i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s20, s[0:1], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_add_u32 s0, s2, 48 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 32 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s20 @@ -6059,9 +6056,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x80 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index e1caf3bea6119..45a1afbf11992 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 @@ -32,26 +32,26 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 @@ -67,8 +67,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -78,10 +78,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -89,33 +89,33 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -145,8 +145,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -188,13 +188,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -203,13 +203,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -218,10 +218,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -248,7 +246,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -264,7 +262,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -281,7 +279,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -292,7 +290,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -303,9 +301,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -328,7 +324,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -344,7 +340,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -361,7 +357,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -372,7 +368,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -383,9 +379,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -408,8 +402,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -428,8 +422,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -451,13 +445,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -466,13 +460,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -481,10 +475,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -512,8 +504,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -532,8 +524,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -555,13 +547,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -570,13 +562,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -585,10 +577,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -616,8 +606,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -636,8 +626,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -659,13 +649,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -674,13 +664,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -689,10 +679,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -721,8 +709,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -741,8 +729,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -764,13 +752,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -779,13 +767,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -794,10 +782,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index a26b84e17374a..5d20a848bd6a6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -15,7 +15,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -27,17 +27,17 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -50,12 +50,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -78,17 +78,17 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -101,12 +101,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) @@ -126,7 +126,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -164,23 +164,23 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -191,13 +191,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -208,10 +208,10 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -222,23 +222,23 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -249,13 +249,13 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -266,10 +266,10 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -280,23 +280,23 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -307,13 +307,13 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -324,10 +324,10 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -338,23 +338,23 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -365,13 +365,13 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -382,10 +382,10 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -396,23 +396,23 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -423,13 +423,13 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -440,10 +440,10 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -454,23 +454,23 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -481,13 +481,13 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -498,10 +498,10 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -512,23 +512,23 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -539,13 +539,13 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -556,10 +556,10 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -570,23 +570,23 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -597,13 +597,13 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -614,10 +614,10 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -628,23 +628,23 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -655,13 +655,13 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -672,10 +672,10 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -686,23 +686,23 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -713,13 +713,13 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -730,10 +730,10 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -744,23 +744,23 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -771,13 +771,13 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -788,10 +788,10 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -802,23 +802,23 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -829,13 +829,13 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -846,10 +846,10 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -860,23 +860,23 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -887,13 +887,13 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -904,10 +904,10 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -918,23 +918,23 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -945,13 +945,13 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13) store i32 %result, ptr addrspace(1) %out @@ -961,7 +961,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -973,7 +973,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -983,7 +983,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -996,7 +996,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_one: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_one: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_one: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_one: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1083,7 +1083,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1111,7 +1111,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_olt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_olt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_olt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_olt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1211,7 +1211,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ole: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ole: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ole: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1246,7 +1246,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ole: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1273,7 +1273,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1311,7 +1311,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_o: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_o: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_o: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_o: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uo: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1373,7 +1373,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uo: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uo: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1396,7 +1396,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uo: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_une: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1423,7 +1423,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_une: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1433,7 +1433,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_une: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_une: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1473,7 +1473,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1523,7 +1523,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1546,7 +1546,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1623,7 +1623,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1646,7 +1646,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1663,12 +1663,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1678,26 +1678,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1707,14 +1707,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half %temp, i32 1) @@ -1727,12 +1727,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1742,26 +1742,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1771,14 +1771,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %src_input = call half @llvm.fabs.f16(half %src) @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1808,7 +1808,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1823,10 +1823,10 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1837,23 +1837,23 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1864,13 +1864,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -1882,10 +1882,10 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1896,23 +1896,23 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1923,13 +1923,13 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -1941,10 +1941,10 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1955,23 +1955,23 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1982,13 +1982,13 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -2000,10 +2000,10 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2014,23 +2014,23 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2041,13 +2041,13 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -2059,10 +2059,10 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2073,23 +2073,23 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2100,13 +2100,13 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -2118,10 +2118,10 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2132,23 +2132,23 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2159,13 +2159,13 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -2177,10 +2177,10 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2191,23 +2191,23 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2218,13 +2218,13 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -2236,10 +2236,10 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2250,23 +2250,23 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2277,13 +2277,13 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -2295,10 +2295,10 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2309,23 +2309,23 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2336,13 +2336,13 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -2354,10 +2354,10 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2368,23 +2368,23 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2395,13 +2395,13 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -2413,10 +2413,10 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2427,23 +2427,23 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2454,13 +2454,13 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -2471,10 +2471,10 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2485,23 +2485,23 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2512,13 +2512,13 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -2529,10 +2529,10 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2543,23 +2543,23 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2570,13 +2570,13 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -2587,10 +2587,10 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2601,23 +2601,23 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2628,13 +2628,13 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13) store i32 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 7e78d8b05d09f..674fec1b865a6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -30,19 +30,19 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -74,7 +74,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |s3| @@ -88,19 +88,19 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -137,7 +137,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -163,7 +163,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -178,11 +178,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -193,24 +193,24 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -220,11 +220,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -240,11 +240,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -255,24 +255,24 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -282,11 +282,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -302,11 +302,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -317,24 +317,24 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -344,11 +344,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -364,11 +364,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -379,24 +379,24 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -406,11 +406,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -426,11 +426,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -441,24 +441,24 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -468,11 +468,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -488,11 +488,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -503,24 +503,24 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -530,11 +530,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -550,11 +550,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -565,24 +565,24 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -592,11 +592,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -612,11 +612,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -627,24 +627,24 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -674,11 +674,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -689,24 +689,24 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -716,11 +716,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -736,11 +736,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -751,24 +751,24 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -778,11 +778,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -798,11 +798,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -813,24 +813,24 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -840,11 +840,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -860,11 +860,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -875,24 +875,24 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -902,11 +902,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -922,11 +922,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -937,24 +937,24 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -964,11 +964,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -984,11 +984,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -999,24 +999,24 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1026,11 +1026,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oeq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1059,7 +1059,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1086,7 +1086,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1105,7 +1105,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1119,7 +1119,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1165,7 +1165,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ogt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1206,7 +1206,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1225,7 +1225,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1252,7 +1252,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_olt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1299,7 +1299,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1345,7 +1345,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ole: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ueq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1419,7 +1419,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,7 +1465,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_o: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1506,7 +1506,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,7 +1525,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1585,7 +1585,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_une: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1599,7 +1599,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1626,7 +1626,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,7 +1705,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1719,7 +1719,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1765,7 +1765,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1792,7 +1792,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +1806,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1825,7 +1825,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1839,7 +1839,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,13 +1887,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |s2| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1904,26 +1904,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; GFX9-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1933,12 +1933,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1956,13 +1956,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |s2| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1973,26 +1973,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2002,12 +2002,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2028,7 +2028,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2042,7 +2042,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -2070,11 +2070,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2085,24 +2085,24 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2112,11 +2112,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2133,11 +2133,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2148,24 +2148,24 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2175,11 +2175,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2196,11 +2196,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2211,24 +2211,24 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2238,11 +2238,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2259,11 +2259,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2274,24 +2274,24 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2301,11 +2301,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2322,11 +2322,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2337,24 +2337,24 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2364,11 +2364,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2385,11 +2385,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2400,24 +2400,24 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,11 +2427,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2448,11 +2448,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,24 +2463,24 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2490,11 +2490,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2511,11 +2511,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2526,24 +2526,24 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2553,11 +2553,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2574,11 +2574,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2589,24 +2589,24 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2616,11 +2616,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2637,11 +2637,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2652,24 +2652,24 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2679,11 +2679,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2700,11 +2700,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2715,24 +2715,24 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2742,11 +2742,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2762,11 +2762,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2777,24 +2777,24 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2804,11 +2804,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2824,11 +2824,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2839,24 +2839,24 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2866,11 +2866,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2886,11 +2886,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2901,24 +2901,24 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2928,11 +2928,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll index d7dd0ce58a08f..4a66b761306f3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32) ; GCN-LABEL: {{^}}global_atomic_csub_rtn: ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) { main_body: %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data) @@ -15,7 +15,7 @@ main_body: ; GCN-LABEL: {{^}}global_atomic_csub_no_rtn: ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] +; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 { main_body: %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data) @@ -24,7 +24,7 @@ main_body: ; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn: ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN +; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 th:TH_ATOMIC_RETURN define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) { main_body: %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1 @@ -34,7 +34,7 @@ main_body: ; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn: ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4 +; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 { main_body: %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 9e3e393d82e22..8fe85e49a4207 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -22,10 +22,10 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -36,23 +36,23 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -63,13 +63,13 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -97,7 +97,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -111,10 +111,10 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -125,23 +125,23 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -152,13 +152,13 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -169,10 +169,10 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -183,23 +183,23 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -210,13 +210,13 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -227,10 +227,10 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,23 +241,23 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -268,13 +268,13 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -285,10 +285,10 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -299,23 +299,23 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -326,13 +326,13 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -343,10 +343,10 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -357,23 +357,23 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -384,13 +384,13 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -401,10 +401,10 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i32_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -415,23 +415,23 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i32_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -442,13 +442,13 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i32_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -459,10 +459,10 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -473,23 +473,23 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -500,13 +500,13 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -517,10 +517,10 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -531,23 +531,23 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -558,13 +558,13 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -575,10 +575,10 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -589,23 +589,23 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -616,13 +616,13 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -632,7 +632,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_eq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_eq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -654,7 +654,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_eq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -667,7 +667,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_eq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_ne: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -694,7 +694,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_ne: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -704,7 +704,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_ne: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -717,7 +717,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_ne: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -732,7 +732,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -744,7 +744,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -754,7 +754,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -767,7 +767,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -782,7 +782,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -804,7 +804,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -817,7 +817,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -832,7 +832,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -844,7 +844,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -854,7 +854,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -867,7 +867,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -894,7 +894,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -904,7 +904,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -932,7 +932,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sgt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -944,7 +944,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sgt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -954,7 +954,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sgt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -967,7 +967,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sgt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -982,7 +982,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -994,7 +994,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_slt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_slt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_slt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1067,7 +1067,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_slt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sle: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sle: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sle: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1117,7 +1117,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sle: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1133,10 +1133,10 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1147,23 +1147,23 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1174,13 +1174,13 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1222,10 +1222,10 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1236,23 +1236,23 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1263,13 +1263,13 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -1280,10 +1280,10 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1294,23 +1294,23 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1321,13 +1321,13 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -1338,10 +1338,10 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1352,23 +1352,23 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1379,13 +1379,13 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -1396,10 +1396,10 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1410,23 +1410,23 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1437,13 +1437,13 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -1454,10 +1454,10 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1468,23 +1468,23 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1495,13 +1495,13 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -1512,10 +1512,10 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i16_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1526,23 +1526,23 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i16_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1553,13 +1553,13 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i16_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -1570,10 +1570,10 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1584,23 +1584,23 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1611,13 +1611,13 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -1628,10 +1628,10 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1642,23 +1642,23 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1669,13 +1669,13 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -1686,10 +1686,10 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1700,23 +1700,23 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1727,13 +1727,13 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 @@ -1759,16 +1759,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_gt_u32 s2, 1 -; GFX10-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-NEXT: s_cmp_gt_u32 s3, 2 -; GFX10-NEXT: s_cselect_b32 s3, -1, 0 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_cmp_gt_u32 s6, 1 +; GFX10-NEXT: s_cselect_b32 s0, -1, 0 +; GFX10-NEXT: s_cmp_gt_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 +; GFX10-NEXT: s_and_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 60e242bf5b0e8..a650f999835c6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -25,11 +25,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -40,11 +40,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -54,24 +54,24 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i32: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -131,11 +131,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -146,11 +146,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -160,24 +160,24 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -193,11 +193,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -208,11 +208,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -222,24 +222,24 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -255,11 +255,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -270,11 +270,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -284,24 +284,24 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,11 +317,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -332,11 +332,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -346,24 +346,24 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -379,11 +379,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -394,11 +394,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -408,24 +408,24 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -441,11 +441,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GFX11-LABEL: v_icmp_i32_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -456,11 +456,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i32_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -470,24 +470,24 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; GFX9-LABEL: v_icmp_i32_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -503,11 +503,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -518,11 +518,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -532,24 +532,24 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -565,11 +565,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -580,11 +580,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -594,24 +594,24 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -627,11 +627,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -642,11 +642,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -656,24 +656,24 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -688,7 +688,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_eq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3] @@ -702,7 +702,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,7 +716,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -729,7 +729,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -748,7 +748,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_ne: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3] @@ -762,7 +762,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -776,7 +776,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -808,7 +808,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3] @@ -822,7 +822,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -909,7 +909,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -928,7 +928,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3] @@ -942,7 +942,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -969,7 +969,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -988,7 +988,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3] @@ -1002,7 +1002,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sgt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1108,7 +1108,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3] @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_slt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1209,7 +1209,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,7 +1228,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3] @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1269,7 +1269,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1289,11 +1289,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1304,11 +1304,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1318,24 +1318,24 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1362,7 +1362,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i16: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -1395,11 +1395,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1410,11 +1410,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1424,24 +1424,24 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1457,11 +1457,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,11 +1472,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1486,24 +1486,24 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1519,11 +1519,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,11 +1534,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1548,24 +1548,24 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1581,11 +1581,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1596,11 +1596,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1610,24 +1610,24 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1643,11 +1643,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1658,11 +1658,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1672,24 +1672,24 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1705,11 +1705,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GFX11-LABEL: v_icmp_i16_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1720,11 +1720,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i16_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1734,24 +1734,24 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; GFX9-LABEL: v_icmp_i16_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,11 +1767,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1796,24 +1796,24 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1829,11 +1829,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1844,11 +1844,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1858,24 +1858,24 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1891,11 +1891,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1906,11 +1906,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1920,24 +1920,24 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1952,7 +1952,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 @@ -1970,7 +1970,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: v_icmp_i1_ne0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_u32 s2, 1 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -1986,17 +1986,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_gt_u32 s2, 1 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_gt_u32 s3, 2 +; GFX9-NEXT: s_cmp_gt_u32 s6, 1 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_gt_u32 s7, 2 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 3168e05b816be..b0706025f0b68 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,13 +14,12 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 ; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 @@ -45,7 +44,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 ; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(4) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] ; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 @@ -81,7 +80,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[128:131] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 @@ -152,13 +151,13 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -177,8 +176,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 ; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: ; iglp_opt mask(0x00000001) +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 @@ -220,7 +218,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 2d01703c78d78..74d936f8093dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -163,7 +163,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 @@ -171,10 +171,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s0, s0, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 -; GFX1013-NEXT: v_add_co_u32 v4, s0, s2, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v2, s0, s4, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s5, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v4, s0, s6, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s7, 0, s0 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -182,14 +182,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 @@ -215,13 +215,11 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -230,8 +228,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -262,15 +260,15 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s0, s0, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 -; GFX1013-NEXT: v_add_co_u32 v4, s0, s2, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v2, s0, s4, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s5, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v4, s0, s6, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s7, 0, s0 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -278,14 +276,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 @@ -308,22 +306,21 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 -; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -352,9 +349,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -369,7 +366,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -378,9 +375,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 @@ -394,7 +391,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] @@ -404,16 +401,14 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 -; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -447,9 +442,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -461,7 +456,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -470,9 +465,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -483,7 +478,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 @@ -493,23 +488,19 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 -; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102 +; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 8e9a652ae8a8e..cd92529b77165 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -11,24 +11,24 @@ declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() -define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_i32: +define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_i32: +; GFX11-LABEL: v_permlane16_b32_vss: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -38,11 +38,11 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_i32: +; GFX12-LABEL: v_permlane16_b32_vss: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -51,847 +51,687 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_f32: +define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vii: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_f32: +; GFX11-LABEL: v_permlane16_b32_vii: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_f32: +; GFX12-LABEL: v_permlane16_b32_vii: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vss_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm +; FIXME-GFX10PLUS: It is allowed to have both immediates as literals +define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vll: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vss_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_vll: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vll: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vvv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX11-SDAG-LABEL: v_permlane16_b32_vvv: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX11-GISEL-LABEL: v_permlane16_b32_vvv: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX12-SDAG-LABEL: v_permlane16_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX12-GISEL-LABEL: v_permlane16_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vss_f64: +define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX10-LABEL: v_permlane16_b32_vsv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vii_i32: +define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vii_i32: +; GFX11-LABEL: v_permlane16_b32_vss_fi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vii_i32: +; GFX12-LABEL: v_permlane16_b32_vss_fi: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float %src0) { -; GFX10-LABEL: v_permlane16_b32_vii_f32: +define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vii_f32: +; GFX11-LABEL: v_permlane16_b32_vss_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vii_f32: +; GFX12-LABEL: v_permlane16_b32_vss_bc: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi_bc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_vss_fi_bc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out +; GFX12-LABEL: v_permlane16_b32_vss_fi_bc: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlanex16_b32_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out +; GFX12-LABEL: v_permlanex16_b32_vss: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -; FIXME-GFX10PLUS: It is allowed to have both immediates as literals -define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vll_i32: +define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vii: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vll_i32: +; GFX11-LABEL: v_permlanex16_b32_vii: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vll_i32: +; GFX12-LABEL: v_permlanex16_b32_vii: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float %src0) { -; GFX10-LABEL: v_permlane16_b32_vll_f32: +; FIXME-GFX10PLUS: It is allowed to have both immediates as literals +define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vll: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vll_f32: +; GFX11-LABEL: v_permlanex16_b32_vll: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vll_f32: +; GFX12-LABEL: v_permlanex16_b32_vll: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) - store double %v, ptr addrspace(1) %out + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vvv_i32: +define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vvv: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -901,15 +741,15 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -918,9135 +758,925 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64: +define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64: +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvv_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_vvv_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) { -; GFX10-LABEL: v_permlane16_b32_vvv_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f32: +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_vvv_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_vvv_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store double %v, ptr addrspace(1) %out + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX10-LABEL: v_permlanex16_b32_vsv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32: +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm +define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvs_i64: +; GFX11-LABEL: v_permlanex16_b32_vss_fi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvs_i64: +; GFX12-LABEL: v_permlanex16_b32_vss_fi: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +define amdgpu_kernel void @v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_bc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlanex16_b32_vss_bc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX12-LABEL: v_permlanex16_b32_vss_bc: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out +; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_tid_tid: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-LABEL: v_permlane16_b32_tid_tid: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-LABEL: v_permlane16_b32_tid_tid: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out + %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_undef_tid: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_undef_tid: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) +; GFX12-LABEL: v_permlane16_b32_undef_tid: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i64: +define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_i_tid_fi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store float %v, ptr addrspace(1) %out +; GFX12-LABEL: v_permlane16_b32_i_tid_fi: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, double %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_bc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_i_tid_bc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store double %v, ptr addrspace(1) %out +; GFX12-LABEL: v_permlane16_b32_i_tid_bc: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_fi_i32: +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_fi_i32: +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_fi_i32: +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_fi_f32: +define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_tid_tid: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_fi_f32: +; GFX11-LABEL: v_permlanex16_b32_tid_tid: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_fi_f32: +; GFX12-LABEL: v_permlanex16_b32_tid_tid: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store double %v, ptr addrspace(1) %out + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_bc_i32: +define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_undef_tid: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_bc_i32: +; GFX11-LABEL: v_permlanex16_b32_undef_tid: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_bc_i32: +; GFX12-LABEL: v_permlanex16_b32_undef_tid: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: +define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store i64 %v, ptr addrspace(1) %out + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_bc_f32: +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_bc_f32: +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_bc_f32: +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store double %v, ptr addrspace(1) %out + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_i32: +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_i32: +; GFX11-LABEL: v_permlanex16_b32_i_tid_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32: +; GFX12-LABEL: v_permlanex16_b32_i_tid_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_f32: +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_f32: +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32: +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vii_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vii_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vii_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, float %src0) { -; GFX10-LABEL: v_permlanex16_b32_vii_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vii_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vii_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -; FIXME-GFX10PLUS: It is allowed to have both immediates as literals -define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vll_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vll_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vll_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, float %src0) { -; GFX10-LABEL: v_permlanex16_b32_vll_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vll_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vll_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vvv_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) { -; GFX10-LABEL: v_permlanex16_b32_vvv_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vvv_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vvv_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vvv_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vvv_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vvs_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vvs_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vvs_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vvs_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, double %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_fi_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_fi_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_fi_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_fi_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_bc_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_bc_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_bc_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_bc_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_tid_tid_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_tid_tid_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_tid_tid_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_tid_tid_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_tid_tid_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_tid_tid_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %v = call float @llvm.amdgcn.permlane16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %v = call double @llvm.amdgcn.permlane16.f64(double %tidx_f64, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_undef_tid_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_undef_tid_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_undef_tid_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_undef_tid_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_undef_tid_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_undef_tid_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %undef = freeze float poison - %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %undef = freeze i64 poison - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %undef = freeze double poison - %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %v = call float @llvm.amdgcn.permlane16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %v = call double @llvm.amdgcn.permlane16.f64(double 1234.5, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_fi_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_i_tid_fi_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_fi_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_i_tid_fi_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %undef = freeze float poison - %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %undef = freeze i64 poison - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %undef = freeze double poison - %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_bc_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_i_tid_bc_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_bc_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_i_tid_bc_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %undef = freeze float poison - %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 true) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %undef = freeze i64 poison - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 true) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %undef = freeze double poison - %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 true) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %undef = freeze float poison - %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 true) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %undef = freeze i64 poison - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 true) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %undef = freeze double poison - %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 true) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_tid_tid_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_tid_tid_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_tid_tid_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_tid_tid_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %v = call float @llvm.amdgcn.permlanex16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %v = call double @llvm.amdgcn.permlanex16.f64(double %tidx_f64, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_undef_tid_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_undef_tid_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_undef_tid_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_undef_tid_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %undef = freeze float poison - %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %undef = freeze i64 poison - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %undef = freeze double poison - %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %v = call float @llvm.amdgcn.permlanex16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %v = call double @llvm.amdgcn.permlanex16.f64(double 1234.5, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %undef = freeze float poison - %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 false) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %undef = freeze i64 poison - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %undef = freeze double poison - %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 false) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %undef = freeze float poison - %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 true) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %undef = freeze i64 poison - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 true) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %undef = freeze double poison - %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 true) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %undef = freeze float poison - %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 true) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %undef = freeze i64 poison - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 true) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %undef = freeze double poison - %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 true) - store double %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_half: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_readfirstlane_b32 s5, v4 -; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-NEXT: global_store_short v[0:1], v2, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_permlane16_half: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_permlane16_half: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v3 -; GFX12-NEXT: v_readfirstlane_b32 s1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-NEXT: global_store_b16 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] - %v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store half %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_half: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_readfirstlane_b32 s5, v4 -; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-NEXT: global_store_short v[0:1], v2, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_permlanex16_half: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_permlanex16_half: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v3 -; GFX12-NEXT: v_readfirstlane_b32 s1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-NEXT: global_store_b16 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] - %v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store half %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_bfloat: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_readfirstlane_b32 s5, v4 -; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-NEXT: global_store_short v[0:1], v2, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_permlane16_bfloat: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_permlane16_bfloat: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v3 -; GFX12-NEXT: v_readfirstlane_b32 s1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-NEXT: global_store_b16 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] - %v = call bfloat @llvm.amdgcn.permlane16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store bfloat %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_bfloat: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_readfirstlane_b32 s5, v4 -; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-NEXT: global_store_short v[0:1], v2, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_permlanex16_bfloat: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_permlanex16_bfloat: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v3 -; GFX12-NEXT: v_readfirstlane_b32 s1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-NEXT: global_store_b16 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] - %v = call bfloat @llvm.amdgcn.permlanex16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store bfloat %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_readfirstlane_b32 s5, v4 -; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-NEXT: global_store_short v[0:1], v2, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_permlane16_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_permlane16_i16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v3 -; GFX12-NEXT: v_readfirstlane_b32 s1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-NEXT: global_store_b16 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] - %v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store i16 %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_readfirstlane_b32 s5, v4 -; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-NEXT: global_store_short v[0:1], v2, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_permlanex16_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_permlanex16_i16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v3 -; GFX12-NEXT: v_readfirstlane_b32 s1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-NEXT: global_store_b16 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] - %v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store i16 %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_readfirstlane_b32 s5, v4 -; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-NEXT: global_store_dword v[0:1], v2, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_permlane16_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_permlane16_v2f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v3 -; GFX12-NEXT: v_readfirstlane_b32 s1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] - %v = call <2 x half> @llvm.amdgcn.permlane16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store <2 x half> %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_readfirstlane_b32 s5, v4 -; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-NEXT: global_store_dword v[0:1], v2, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_permlanex16_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_permlanex16_v2f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s0, v3 -; GFX12-NEXT: v_readfirstlane_b32 s1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] - %v = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store <2 x half> %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_v2f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_permlane16_v2f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_permlane16_v2f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_permlane16_v2f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: v_permlane16_v2f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: v_permlane16_v2f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <2 x float> @llvm.amdgcn.permlane16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store <2 x float> %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_v2f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_permlanex16_v2f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_permlanex16_v2f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_permlanex16_v2f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: v_permlanex16_v2f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: v_permlanex16_v2f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store <2 x float> %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_v7i32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v9 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v10 -; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 -; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_permlane16_v7i32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v9 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v10 -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_permlane16_v7i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v9 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_permlane16_v7i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v9 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: v_permlane16_v7i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: v_permlane16_v7i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <7 x i32> @llvm.amdgcn.permlane16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store <7 x i32> %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_v7i32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v9 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v10 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 -; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_permlanex16_v7i32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v9 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v10 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_permlanex16_v7i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v9 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_permlanex16_v7i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v9 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: v_permlanex16_v7i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: v_permlanex16_v7i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <7 x i32> @llvm.amdgcn.permlanex16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store <7 x i32> %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_v8i16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 -; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_permlane16_v8i16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 -; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_permlane16_v8i16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_permlane16_v8i16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 -; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: v_permlane16_v8i16: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: v_permlane16_v8i16: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 -; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <8 x i16> @llvm.amdgcn.permlane16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store <8 x i16> %v, ptr addrspace(1) %out - ret void -} - -define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_v8i16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_permlanex16_v8i16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 -; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_permlanex16_v8i16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_permlanex16_v8i16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 -; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: v_permlanex16_v8i16: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: v_permlanex16_v8i16: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 -; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <8 x i16> @llvm.amdgcn.permlanex16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store <8 x i16> %v, ptr addrspace(1) %out + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index a65143255bbb4..973678291e263 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -54,7 +54,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -72,7 +72,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -103,8 +103,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -116,9 +115,9 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -135,7 +134,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -148,7 +147,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -166,7 +165,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -179,7 +178,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -197,7 +196,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -210,7 +209,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -228,7 +227,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -241,7 +240,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -259,7 +258,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -272,7 +271,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -290,7 +289,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -303,7 +302,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -321,8 +320,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -334,9 +332,9 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -353,7 +351,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -366,7 +364,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -384,7 +382,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -397,7 +395,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -415,7 +413,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -428,7 +426,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -447,11 +445,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -462,10 +459,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -483,11 +480,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -498,10 +494,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -520,11 +516,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -535,12 +531,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -558,11 +552,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -573,10 +566,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -595,11 +588,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -610,10 +602,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -632,11 +624,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -647,10 +638,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -669,11 +660,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -684,10 +674,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -705,11 +695,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -720,10 +709,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -742,11 +731,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -757,12 +746,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -780,11 +767,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -795,10 +781,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -817,11 +803,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -832,10 +817,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -854,11 +839,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -869,10 +853,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index d521a6c25e462..47c021769aa56 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -10,104 +10,102 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry -; VARIANT0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; VARIANT0-NEXT: s_load_dword s4, s[2:3], 0xb -; VARIANT0-NEXT: s_mov_b32 s3, 0xf000 -; VARIANT0-NEXT: s_mov_b32 s2, 0 +; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; VARIANT0-NEXT: s_load_dword s0, s[0:1], 0xb +; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 +; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 -; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_endpgm ; ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry -; VARIANT1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; VARIANT1-NEXT: s_load_dword s4, s[2:3], 0xb -; VARIANT1-NEXT: s_mov_b32 s3, 0xf000 -; VARIANT1-NEXT: s_mov_b32 s2, 0 +; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb +; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 +; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) -; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 +; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_waitcnt vmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_endpgm ; ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry -; VARIANT2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VARIANT2-NEXT: s_load_dword s4, s[2:3], 0x2c +; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT2-NEXT: v_mov_b32_e32 v3, s1 -; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 +; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier ; VARIANT2-NEXT: global_load_dword v0, v[0:1], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry -; VARIANT3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VARIANT3-NEXT: s_load_dword s4, s[2:3], 0x2c +; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT3-NEXT: v_mov_b32_e32 v3, s1 -; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 +; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT3-NEXT: s_barrier ; VARIANT3-NEXT: global_load_dword v0, v[0:1], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT3-NEXT: s_endpgm ; ; VARIANT4-LABEL: test_barrier: ; VARIANT4: ; %bb.0: ; %entry -; VARIANT4-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; VARIANT4-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) -; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v2 +; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT4-NEXT: s_wait_kmcnt 0x0 -; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2 -; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1] +; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 +; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_wait_storecnt 0x0 ; VARIANT4-NEXT: s_barrier_signal -1 ; VARIANT4-NEXT: s_barrier_wait -1 -; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT4-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT4-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; VARIANT4-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT4-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; VARIANT4-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo +; VARIANT4-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT4-NEXT: s_wait_loadcnt 0x0 ; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_nop 0 @@ -116,22 +114,20 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT5-LABEL: test_barrier: ; VARIANT5: ; %bb.0: ; %entry -; VARIANT5-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; VARIANT5-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) -; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v2 +; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT5-NEXT: s_wait_kmcnt 0x0 -; VARIANT5-NEXT: v_xad_u32 v0, v2, -1, s2 -; VARIANT5-NEXT: global_store_b32 v3, v2, s[0:1] +; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2 +; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_barrier_signal -1 ; VARIANT5-NEXT: s_barrier_wait -1 -; VARIANT5-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT5-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT5-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT5-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; VARIANT5-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT5-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; VARIANT5-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo +; VARIANT5-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT5-NEXT: s_wait_loadcnt 0x0 ; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_nop 0 @@ -140,24 +136,23 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT6-LABEL: test_barrier: ; VARIANT6: ; %bb.0: ; %entry -; VARIANT6-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 -; VARIANT6-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0 ; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 -; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) -; VARIANT6-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4 -; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4 -; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1] -; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0 +; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_wait_storecnt 0x0 ; VARIANT6-NEXT: s_barrier_signal -1 ; VARIANT6-NEXT: s_barrier_wait -1 +; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT6-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT6-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; VARIANT6-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT6-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT6-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; VARIANT6-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo +; VARIANT6-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT6-NEXT: s_wait_loadcnt 0x0 ; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 8bfe996c6a90a..38a34ec6daf73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -5,11 +5,10 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -23,11 +22,10 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -53,11 +51,10 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -71,11 +68,10 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -101,11 +97,10 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -119,11 +114,10 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -149,12 +143,12 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_var: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_mov_b32 m0, 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] @@ -168,13 +162,11 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v2, 0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -230,10 +222,8 @@ define void @test2_s_barrier_signal_var(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -252,10 +242,8 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -290,10 +278,8 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -312,10 +298,8 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -350,10 +334,8 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -372,10 +354,8 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -410,11 +390,9 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst_var: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -433,11 +411,9 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -542,11 +518,10 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test1_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -560,11 +535,10 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -588,11 +562,10 @@ entry: define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test2_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -606,11 +579,10 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -634,11 +606,10 @@ entry: define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test3_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -652,11 +623,10 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -680,17 +650,15 @@ entry: define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { ; GCN-LABEL: test4_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_barrier_init m0 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -700,11 +668,10 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 @@ -765,12 +732,11 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -780,11 +746,10 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -807,12 +772,11 @@ entry: define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -822,11 +786,10 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -849,12 +812,11 @@ entry: define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -864,11 +826,10 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -891,11 +852,11 @@ entry: define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_barrier_join_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -908,11 +869,10 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -964,10 +924,8 @@ define void @test5_s_barrier_join_m0(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_leave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_leave @@ -985,16 +943,14 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_leave: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_leave ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 ; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GLOBAL-ISEL-NEXT: s_clause 0x1 @@ -1022,12 +978,11 @@ entry: define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1037,11 +992,10 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1064,12 +1018,11 @@ entry: define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1079,11 +1032,10 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1106,12 +1058,11 @@ entry: define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1121,11 +1072,10 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1148,11 +1098,11 @@ entry: define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_wakeup_barrier_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -1165,11 +1115,10 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 ; ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -1221,12 +1170,11 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, -1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, -1 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1234,14 +1182,13 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1259,12 +1206,11 @@ entry: define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, 1 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1272,14 +1218,13 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1297,12 +1242,11 @@ entry: define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 0 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, 0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1310,14 +1254,13 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1335,10 +1278,8 @@ entry: define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_get_barrier_state_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1353,10 +1294,8 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i ; ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1413,11 +1352,10 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test_barrier_convert: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1431,11 +1369,10 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test_barrier_convert: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 527627a5a2f67..c2e74eb05d164 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -5,11 +5,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -73,11 +72,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -177,11 +175,10 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 @@ -259,11 +256,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index a29e2298210a3..fdcb1773d0a3f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -7,12 +7,11 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16( define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; GCN-NEXT: ds_load_b128 v[8:11], v0 @@ -59,12 +58,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 @@ -149,131 +147,127 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v16, 0x3ff, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v18, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GCN-NEXT: v_lshl_add_u32 v17, v16, 5, s0 -; GCN-NEXT: v_lshl_add_u32 v16, v16, 4, s1 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:1024 -; GCN-NEXT: ds_load_b128 v[0:3], v17 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 +; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; GCN-NEXT: ds_load_b128 v[1:4], v17 +; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x2 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_wait_dscnt 0x0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:2560 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: ds_store_b128 v0, v[13:16] +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:512 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:4608 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1024 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:7168 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1536 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:10240 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:10240 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2048 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v16, 5, s0 -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v16, v16, 4, s1 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:2560 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:512 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:4608 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:7168 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1536 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:10240 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2048 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 24b8a3c2dc873..ae5b62ffb285b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,19 +29,18 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -83,33 +82,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(7) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -151,14 +149,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -180,27 +178,34 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 +; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 +; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 -; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 -; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 +; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -208,33 +213,24 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 +; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 +; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_lo_u32 v19, v19, v19 ; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v23, v23, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -245,14 +241,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -261,27 +258,34 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -289,33 +293,24 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) @@ -326,14 +321,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -385,55 +381,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 +; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 +; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -442,87 +430,86 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -531,33 +518,40 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -620,11 +614,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -668,7 +661,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 2.0 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(14) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -688,7 +681,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -727,11 +720,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -775,7 +767,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -795,7 +787,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -870,13 +862,12 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v0, s0, v1 +; GCN-NEXT: v_add_u32_e32 v0, s2, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -887,7 +878,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-NEXT: v_add_u32_e32 v1, s3, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 @@ -911,7 +902,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -1004,13 +995,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s2, v1 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -1021,7 +1011,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s3, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 @@ -1045,7 +1035,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s3 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -1198,9 +1188,9 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, <5 x float> %in1) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1212,8 +1202,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1288,7 +1277,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mul_f32_e32 v4, s7, v3 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x54 +; GCN-NEXT: s_load_dword s8, s[0:1], 0x54 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1324,7 +1313,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1335,8 +1324,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1383,9 +1372,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,8 +1386,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1473,7 +1461,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s7, v3 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[2:3], 0x54 +; EXACTCUTOFF-NEXT: s_load_dword s8, s[0:1], 0x54 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1509,7 +1497,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1520,8 +1508,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr2_sgpr3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 114d2d099ab7b..fc33206845a71 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -24,7 +24,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 1 @@ -39,7 +39,7 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -61,7 +61,7 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -77,17 +77,17 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cmp_lg_u32 s3, 56 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -127,8 +127,8 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x40400000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -171,8 +171,8 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x10001 @@ -191,8 +191,8 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3c003c00 @@ -211,7 +211,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -235,7 +235,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1.0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -259,8 +259,8 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3f803f80 @@ -279,7 +279,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x10001 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -303,7 +303,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -327,7 +327,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -351,7 +351,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -373,8 +373,8 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -392,8 +392,8 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -411,8 +411,8 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,8 +430,8 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 279a64adfbda1..0755dcddd8f46 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -50,7 +50,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -69,7 +69,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -83,7 +83,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s8, 0x7b @@ -117,7 +117,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s8, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -137,7 +137,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_reg_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_reg_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_imm_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_imm_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +187,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zextload_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -204,7 +204,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: bfe_u32_zextload_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -229,7 +229,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -248,7 +248,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: bfe_u32_zext_in_reg_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -275,7 +275,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -294,7 +294,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: bfe_u32_zext_in_reg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -341,7 +341,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -389,7 +389,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -465,7 +465,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -484,7 +484,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou ; ; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -511,7 +511,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -529,7 +529,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -553,7 +553,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -563,7 +563,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -580,7 +580,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -590,7 +590,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -607,7 +607,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,7 +617,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -635,7 +635,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -653,7 +653,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -679,7 +679,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -698,7 +698,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -724,7 +724,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -742,7 +742,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -767,7 +767,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -785,7 +785,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -810,7 +810,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -828,7 +828,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -852,7 +852,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -870,7 +870,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -894,7 +894,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -912,7 +912,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -936,7 +936,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -954,7 +954,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -979,7 +979,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -997,7 +997,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1197,7 +1197,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1207,7 +1207,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1232,7 +1232,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1272,7 +1272,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1357,7 +1357,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1382,7 +1382,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 40 @@ -1407,7 +1407,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 40 @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_15: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_15: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1447,7 +1447,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_17: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_17: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1497,7 +1497,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1526,45 +1526,47 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 63, v0 ; SI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { @@ -1579,11 +1581,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1591,8 +1593,8 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1609,7 +1611,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s2, s3 @@ -1623,7 +1625,7 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 ; ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,11 +1645,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1655,8 +1657,8 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1673,11 +1675,11 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1685,8 +1687,8 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1703,11 +1705,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: shl_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x150002 +; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1715,8 +1717,8 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: shl_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 142145098df87..4ce0ff20e3b73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -12,34 +12,33 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -47,34 +46,33 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 +; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -82,16 +80,16 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -99,36 +97,36 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -136,10 +134,10 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -164,29 +162,29 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -338,7 +336,7 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -390,7 +388,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -442,59 +440,59 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -504,23 +502,23 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -562,7 +560,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -853,25 +851,25 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -917,7 +915,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -926,19 +923,19 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -990,7 +987,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -999,11 +995,11 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1052,10 +1048,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_exp_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1105,10 +1101,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1160,10 +1156,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1594,26 +1590,26 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1677,7 +1673,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1686,28 +1681,29 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1723,7 +1719,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1748,7 +1744,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1769,7 +1764,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1778,11 +1772,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1793,8 +1787,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1839,16 +1833,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1910,11 +1905,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1926,7 +1921,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1972,16 +1967,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 4d981d27c309e..5ab960f47f57b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -14,34 +14,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -49,34 +48,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 +; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -84,16 +82,16 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -101,36 +99,36 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -138,10 +136,10 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -166,29 +164,29 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -340,7 +338,7 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -392,7 +390,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -444,59 +442,59 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -506,23 +504,23 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -564,7 +562,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -855,25 +853,25 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -919,7 +917,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -928,19 +925,19 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -992,7 +989,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -1001,11 +997,11 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1054,10 +1050,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1107,10 +1103,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1162,10 +1158,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1596,26 +1592,26 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1679,7 +1675,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1688,28 +1683,29 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1725,7 +1721,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1750,7 +1746,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1771,7 +1766,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1780,11 +1774,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1795,8 +1789,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1841,16 +1835,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1912,11 +1907,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1928,7 +1923,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1974,16 +1969,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 9f80e66e8f873..6cca705f7b1db 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -12,17 +12,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_exp2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 @@ -31,35 +31,35 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -69,14 +69,14 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -101,25 +101,24 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -173,7 +172,7 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -199,7 +198,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -223,7 +222,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -247,7 +246,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -271,30 +270,30 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -381,8 +380,8 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -413,8 +412,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 @@ -445,8 +444,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -476,11 +475,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -507,8 +506,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -532,16 +531,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -656,45 +655,45 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8 -; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v6, s2, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v8, s1, v8 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_exp_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_exp_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -730,8 +729,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -767,8 +766,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -804,8 +803,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -835,13 +834,13 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -871,7 +870,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index c2f6fbfe4667c..d847af780acab 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,6 +94,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -101,15 +102,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 @@ -125,6 +126,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -132,17 +134,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -154,20 +156,20 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -181,18 +183,19 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -204,9 +207,8 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -214,13 +216,14 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -230,11 +233,10 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -316,7 +318,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf @@ -357,7 +359,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -396,7 +398,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -443,7 +445,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -490,7 +492,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 @@ -528,7 +530,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -566,7 +568,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-SDAG-LABEL: s_log_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -601,7 +603,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-GISEL-LABEL: s_log_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -747,8 +749,8 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -800,8 +802,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -853,7 +855,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -862,7 +864,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 @@ -919,8 +921,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -984,8 +986,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,8 +1037,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -1087,19 +1089,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1120,7 +1122,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1143,19 +1145,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1176,7 +1178,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1353,8 +1355,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf @@ -1417,8 +1419,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1481,8 +1483,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1563,8 +1565,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,8 +1647,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3377d1cf @@ -1708,8 +1710,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1772,32 +1774,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 @@ -1833,32 +1835,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 0a1f7ab6fc0ae..3f060de9f6596 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,6 +94,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -101,15 +102,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 @@ -125,6 +126,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -132,17 +134,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -154,20 +156,20 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -181,18 +183,19 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -204,9 +207,8 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -214,13 +216,14 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log10_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -230,11 +233,10 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -316,7 +318,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf @@ -357,7 +359,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -396,7 +398,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -443,7 +445,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -490,7 +492,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a @@ -528,7 +530,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -566,7 +568,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log10_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -601,7 +603,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log10_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -747,8 +749,8 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -800,8 +802,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -853,7 +855,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -862,7 +864,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 @@ -919,8 +921,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -984,8 +986,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,8 +1037,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -1087,19 +1089,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1120,7 +1122,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1143,19 +1145,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1176,7 +1178,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1353,8 +1355,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf @@ -1417,8 +1419,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1481,8 +1483,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1563,8 +1565,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,8 +1647,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3284fbcf @@ -1708,8 +1710,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1772,32 +1774,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 @@ -1833,32 +1835,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 7ca04cc235605..90a15ae8d9b28 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -33,35 +33,35 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -71,14 +71,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -103,44 +103,43 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -148,19 +147,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log2_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -216,7 +215,7 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -242,7 +241,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -266,7 +265,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -290,7 +289,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -314,30 +313,30 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -360,7 +359,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 @@ -385,7 +384,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -473,8 +472,8 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -505,8 +504,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 @@ -537,8 +536,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -568,11 +567,11 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -599,8 +598,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -624,16 +623,16 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -659,32 +658,32 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v3f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 ; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -693,21 +692,21 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 @@ -814,45 +813,45 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v8, s1, v8 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_log_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_log_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -888,8 +887,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -925,8 +924,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -962,8 +961,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -993,13 +992,13 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -1029,41 +1028,42 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v4f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1072,32 +1072,32 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index 3d73f84b6e9a8..826862e124920 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -17,12 +17,12 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -45,8 +45,8 @@ entry: define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -56,12 +56,12 @@ define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x1c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -84,8 +84,8 @@ entry: define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,12 +95,12 @@ define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -123,8 +123,8 @@ entry: define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xy: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s4, s4, s5 @@ -135,13 +135,13 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xy: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -166,12 +166,12 @@ entry: define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dword s5, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x6 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_mul_i32 s4, s2, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -179,11 +179,11 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x18 -; VI-NEXT: s_load_dword s5, s[2:3], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x18 +; VI-NEXT: s_load_dword s3, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s4, s5 +; VI-NEXT: s_mul_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -211,7 +211,7 @@ entry: define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_yz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x7 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s0, s0, s1 @@ -224,7 +224,7 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_yz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x1c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -254,13 +254,13 @@ entry: define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xyz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 -; SI-NEXT: s_load_dword s6, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 +; SI-NEXT: s_load_dword s2, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s2, s4, s5 -; SI-NEXT: s_add_i32 s4, s2, s6 +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_add_i32 s4, s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -268,15 +268,15 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xyz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 -; VI-NEXT: s_load_dword s4, s[2:3], 0x20 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; VI-NEXT: s_load_dword s4, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_add_i32 s0, s0, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: s_add_i32 s2, s2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -304,8 +304,8 @@ entry: define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -315,12 +315,12 @@ define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -345,8 +345,8 @@ entry: define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -356,12 +356,12 @@ define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x1c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -386,8 +386,8 @@ entry: define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -397,12 +397,12 @@ define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index d5b4f879bf8a0..8196999b8f1f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-LABEL: round_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -24,39 +24,57 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] -; GFX89-NEXT: s_brev_b32 s4, -2 -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s2 +; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s4 +; GFX11-NEXT: v_trunc_f32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v1, s4, v0 -; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v1|, 0.5 +; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -91,7 +109,7 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 { ; GFX6-LABEL: round_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -115,35 +133,61 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_v2f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s8, -2 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s3 -; GFX89-NEXT: v_sub_f32_e32 v1, s3, v0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v2, s3 -; GFX89-NEXT: v_bfi_b32 v1, s8, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s2 -; GFX89-NEXT: v_sub_f32_e32 v2, s2, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v3, s2 -; GFX89-NEXT: v_bfi_b32 v2, s8, v2, v3 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_brev_b32 s8, -2 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s3 +; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s2 +; GFX8-NEXT: v_sub_f32_e32 v2, s2, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_brev_b32 s8, -2 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s7 +; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v2, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s3 ; GFX11-NEXT: v_trunc_f32_e32 v2, s2 @@ -198,8 +242,8 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 { ; GFX6-LABEL: round_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_brev_b32 s10, -2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -235,50 +279,89 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_v4f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s10, -2 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s7 -; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s5 -; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v4 -; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s4 -; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s10, v4, v5 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_v4f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_brev_b32 s10, -2 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s7 +; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s5 +; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s4 +; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s7 +; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s5 +; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s4 +; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -355,145 +438,145 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 { ; GFX6-LABEL: round_v8f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; GFX6-NEXT: s_brev_b32 s14, -2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_trunc_f32_e32 v0, s7 ; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s6 ; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s5 ; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v4 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s4 ; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX6-NEXT: v_bfi_b32 v4, s14, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, s11 ; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v6, s11 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s10 ; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s9 ; GFX6-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v8, s9 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v8 ; GFX6-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s8 ; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v9, s8 -; GFX6-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX6-NEXT: v_bfi_b32 v8, s14, v8, v9 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX89-LABEL: round_v8f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX89-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s2, -2 -; GFX89-NEXT: s_mov_b32 s15, 0xf000 -; GFX89-NEXT: s_mov_b32 s14, -1 +; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX89-NEXT: s_brev_b32 s14, -2 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_trunc_f32_e32 v0, s7 ; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s6 ; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s5 ; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] ; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4 ; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s4 ; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] ; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5 ; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX89-NEXT: v_trunc_f32_e32 v4, s11 ; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v6, s11 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s10 ; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v6, s10 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s9 ; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v8, s9 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8 ; GFX89-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s8 ; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v9, s8 -; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9 ; GFX89-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -628,10 +711,10 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-LABEL: round_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX6-NEXT: v_trunc_f32_e32 v1, v0 ; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5 @@ -642,44 +725,62 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX89-NEXT: s_movk_i32 s5, 0x7fff -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f16_e32 v1, s4 -; GFX89-NEXT: v_sub_f16_e32 v2, s4, v1 -; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_mov_b32_e32 v2, s4 -; GFX89-NEXT: v_bfi_b32 v0, s5, v0, v2 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f16_e32 v1, s4 +; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e32 v1, s2 +; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1 +; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f16_e32 v0, s4 +; GFX11-NEXT: v_trunc_f16_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v1, s4, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v1|, 0.5 +; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -723,13 +824,13 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-LABEL: round_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX6-NEXT: v_trunc_f32_e32 v3, v1 ; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v0 @@ -748,14 +849,13 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX8-NEXT: s_movk_i32 s6, 0x7fff ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -782,57 +882,57 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; GFX9-LABEL: round_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_movk_i32 s1, 0x7fff +; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: v_trunc_f16_e32 v1, s5 -; GFX9-NEXT: v_sub_f16_e32 v2, s5, v1 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_trunc_f16_e32 v1, s0 +; GFX9-NEXT: v_sub_f16_e32 v2, s0, v1 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_bfi_b32 v2, s6, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_bfi_b32 v2, s1, v2, v3 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v2 -; GFX9-NEXT: v_trunc_f16_e32 v2, s4 -; GFX9-NEXT: v_sub_f16_e32 v3, s4, v2 +; GFX9-NEXT: v_trunc_f16_e32 v2, s2 +; GFX9-NEXT: v_sub_f16_e32 v3, s2, v2 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_bfi_b32 v0, s6, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s5, s4, 16 -; GFX11-NEXT: v_trunc_f16_e32 v1, s4 -; GFX11-NEXT: v_trunc_f16_e32 v0, s5 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_trunc_f16_e32 v1, s2 +; GFX11-NEXT: v_trunc_f16_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v3, s4, v1 -; GFX11-NEXT: v_sub_f16_e32 v2, s5, v0 +; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 +; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v2|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s2 +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v3|, 0.5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s5 +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index e8ac1b2887c36..a54405bf1b471 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,17 +8,17 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s3, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s7, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s2, 0xffff ; VI-NEXT: s_lshr_b32 s2, s2, 16 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -54,16 +54,16 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, s7, s6 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -90,7 +90,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -142,9 +142,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -169,20 +167,20 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -203,22 +201,22 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -226,10 +224,9 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -239,12 +236,9 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -266,20 +260,20 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -300,22 +294,22 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_and_b32 s0, s0, 0xffff -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_and_b32 s1, s8, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s1, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshr_b32_e32 v2, s1, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -323,10 +317,9 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -336,12 +329,9 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -363,7 +353,7 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -374,7 +364,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -394,7 +384,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -414,7 +404,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -425,9 +415,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -450,7 +438,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -461,7 +449,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -480,7 +468,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -497,7 +485,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -508,9 +496,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -533,7 +519,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -545,7 +531,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -567,7 +553,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -596,7 +582,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -608,9 +594,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -636,7 +620,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -648,7 +632,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -670,7 +654,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -689,7 +673,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -701,9 +685,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index b8b4d4440d580..944db3d3adc3a 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -15,18 +15,18 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -56,12 +56,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -70,13 +70,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-MAD-LABEL: madak_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -85,10 +85,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-MAD-LABEL: madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -105,13 +103,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX940-FMA-LABEL: madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -120,13 +117,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-FMA-LABEL: madak_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -135,10 +132,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-FMA-LABEL: madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -170,7 +165,7 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { ; GFX6-LABEL: madak_2_use_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -195,7 +190,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX8-LABEL: madak_2_use_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -225,48 +220,46 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v4, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v4, s[6:7] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-MAD-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v3, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v2, s[4:5] ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] offset:4 +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[6:7] offset:4 ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_2_use_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -276,9 +269,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-MAD-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_add_f32 v2, 0x41200000, v2 -; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2 ; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc @@ -289,49 +282,46 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: v_fmac_f32_e32 v4, v1, v3 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v2, s[4:5] ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] offset:4 +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[6:7] offset:4 ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_2_use_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -375,7 +365,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 { ; GFX6-LABEL: madak_m_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -391,7 +381,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX8-LABEL: madak_m_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -408,36 +398,35 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_nop 0 @@ -446,32 +435,29 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] @@ -498,18 +484,18 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, v3, 4.0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -517,8 +503,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX8-LABEL: madak_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -539,12 +525,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX9-LABEL: madak_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -553,13 +539,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-MAD-LABEL: madak_inline_imm_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -568,10 +554,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-MAD-LABEL: madak_inline_imm_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -588,13 +572,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX940-FMA-LABEL: madak_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -603,13 +586,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-FMA-LABEL: madak_inline_imm_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -618,10 +601,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-FMA-LABEL: madak_inline_imm_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -651,26 +632,26 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 { ; GFX6-LABEL: s_v_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s8, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_v_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -688,23 +669,22 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX9-LABEL: s_v_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mac_f32_e32 v2, s0, v1 +; GFX9-NEXT: v_mac_f32_e32 v2, s2, v1 ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_v_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) @@ -715,15 +695,14 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-MAD-LABEL: s_v_madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-MAD-NEXT: s_nop 0 @@ -732,24 +711,22 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX940-FMA-LABEL: s_v_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s0, v1 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s2, v1 ; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_v_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) @@ -760,10 +737,8 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-FMA-LABEL: s_v_madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] @@ -788,84 +763,82 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: v_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s2, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: v_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mac_f32_e32 v2, s2, v3 +; GFX8-NEXT: v_mac_f32_e32 v2, s0, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e32 v2, s4, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: v_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-MAD-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-MAD-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: v_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] @@ -875,47 +848,44 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a ; ; GFX940-FMA-LABEL: v_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[0:1] -; GFX940-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s4, v1 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: v_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-FMA-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: v_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 +; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -935,7 +905,7 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-LABEL: s_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x41200000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -949,7 +919,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX8-LABEL: s_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -961,28 +931,28 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mac_f32_e32 v1, s2, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mac_f32_e32 v1, s6, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-MAD-NEXT: v_madak_f32 v0, s2, v0, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-MAD-NEXT: v_madak_f32 v0, s6, v0, 0x41200000 +; GFX10-MAD-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: s_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -994,28 +964,28 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s2, v2 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s7 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s6, v2 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-FMA-NEXT: v_fmaak_f32 v0, s6, v0, 0x41200000 +; GFX10-FMA-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: s_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1033,19 +1003,19 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src0_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1053,8 +1023,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src0_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1076,13 +1046,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src0_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1091,13 +1061,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1106,10 +1076,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1126,14 +1094,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1142,13 +1109,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1157,10 +1124,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1191,19 +1156,19 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src1_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1211,8 +1176,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src1_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1234,13 +1199,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src1_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1249,13 +1214,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1264,10 +1229,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1284,14 +1247,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1300,13 +1262,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1315,10 +1277,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1352,36 +1312,36 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { ; GFX6-LABEL: madak_constant_bus_violation: ; GFX6: ; %bb.0: ; %bb -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX6-NEXT: ; %bb.1: ; %bb3 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: .LBB9_2: ; %bb4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x12 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x12 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mac_f32_e64 v1, s0, 0.5 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: madak_constant_bus_violation: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; %bb3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1390,7 +1350,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX8-NEXT: .LBB9_2: ; %bb4 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1401,9 +1361,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX9-LABEL: madak_constant_bus_violation: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %bb3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1412,7 +1372,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX9-NEXT: .LBB9_2: ; %bb4 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1423,9 +1383,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-MAD-LABEL: madak_constant_bus_violation: ; GFX10-MAD: ; %bb.0: ; %bb -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-MAD-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1434,7 +1394,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX10-MAD-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x42280000 @@ -1445,9 +1405,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-MAD-LABEL: madak_constant_bus_violation: ; GFX11-MAD: ; %bb.0: ; %bb -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-MAD-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX11-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1456,7 +1416,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX11-MAD-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x48 +; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x48 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v1, s0, 0.5 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1470,9 +1430,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX940-FMA-LABEL: madak_constant_bus_violation: ; GFX940-FMA: ; %bb.0: ; %bb -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX940-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX940-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX940-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1481,7 +1441,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX940-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX940-FMA-NEXT: global_load_dword v0, v[0:1], off sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX940-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e64 v1, s0, 0.5 @@ -1492,9 +1452,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-FMA-LABEL: madak_constant_bus_violation: ; GFX10-FMA: ; %bb.0: ; %bb -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1503,7 +1463,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX10-FMA-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x42280000 @@ -1514,9 +1474,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-FMA-LABEL: madak_constant_bus_violation: ; GFX11-FMA: ; %bb.0: ; %bb -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1525,7 +1485,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX11-FMA-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x48 +; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x48 ; GFX11-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 3a065d518f0a9..86a5055ab0704 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,26 +5,26 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x3 @@ -69,7 +69,7 @@ bb: define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -250,11 +250,11 @@ bb: define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readnone %arg1, ptr addrspace(1) noalias nocapture %arg2) { ; GCN-LABEL: vector_clause_indirect: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[0:1] +; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 @@ -267,20 +267,20 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap ; ; GCN-SCRATCH-LABEL: vector_clause_indirect: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] +; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x1 ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[4:5], off ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -384,10 +384,10 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 ; GCN-NEXT: s_mov_b32 s19, 0xe00000 -; GCN-NEXT: s_add_u32 s16, s16, s9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GCN-NEXT: s_add_u32 s16, s16, s3 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 @@ -411,13 +411,13 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; ; GCN-SCRATCH-LABEL: flat_scratch_load: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 -; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 +; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 -; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x44 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1 ; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8 @@ -446,29 +446,29 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc %val = call <2 x float> @llvm.amdgcn.image.sample.2d.v2f32.f32(i32 9, float %a, float %b, <8 x i32> %desc, <4 x i32> , i1 false, i32 0, i32 0) %val0 = extractelement <2 x float> %val, i32 0 %valadd = fadd float %load, %val0 - call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %valadd, float undef, float undef, float undef, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 1, float %valadd, float undef, float undef, float undef, i1 immarg true, i1 immarg true) ret void } define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32> %desc) { ; GCN-LABEL: flat_scratch_load_clause: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: s_mov_b32 s15, 0xe00000 -; GCN-NEXT: s_add_u32 s12, s12, s9 -; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xe00000 +; GCN-NEXT: s_add_u32 s4, s4, s3 +; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000 -; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: exp mrt0 v0, off, off, off done vm @@ -476,10 +476,10 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; ; GCN-SCRATCH-LABEL: flat_scratch_load_clause: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 -; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 +; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off @@ -504,7 +504,7 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 %load0 = load float, ptr addrspace(5) %alloca %load1 = load float, ptr addrspace(5) %alloca2 %valadd = fadd float %load0, %load1 - call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %valadd, float undef, float undef, float undef, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 1, float %valadd, float undef, float undef, float undef, i1 immarg true, i1 immarg true) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir index b08da2e1848ff..a87503c731c7a 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir @@ -8,9 +8,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x1_x1 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; CHECK-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1 %0:sgpr_64 = IMPLICIT_DEF %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) @@ -48,16 +48,16 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX11-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %4.sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %4.sub1 ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 + ; GFX12-NEXT: early-clobber %5:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %5.sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %5.sub2 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 %0:sgpr_64 = IMPLICIT_DEF @@ -72,9 +72,9 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; GFX11-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0 @@ -82,9 +82,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 + ; GFX12-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %7.sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %7.sub3 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub2 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 @@ -102,9 +102,9 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX11-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 @@ -120,9 +120,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub0_sub1_sub2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 @@ -157,9 +157,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x2_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 + ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) @@ -171,9 +171,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x2_x2 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) @@ -185,9 +185,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x2_x2_x2_x2 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 @@ -205,9 +205,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x3_x1 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 + ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32)) @@ -219,10 +219,118 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x4_x4 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128)) %2:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128)) ... + +# The constrained multi-dword scalar load merge tests. +--- +name: merge_s_load_x1_x2ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x1_x2ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s64)) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x1_x3ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x1_x3ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96)) +... + +--- +name: merge_s_load_x2ec_x1 +body: | + bb.0: + ; GFX11-LABEL: name: merge_s_load_x2ec_x1 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; + ; GFX12-LABEL: name: merge_s_load_x2ec_x1 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) +... + +--- +name: merge_s_load_x2ec_x2ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x2ec_x2ec_x2ec_x2ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) + early-clobber %3:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64)) + early-clobber %4:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x3ec_x1 +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x3ec_x1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32)) +... + +--- +name: merge_s_load_x4ec_x4ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x4ec_x4ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128)) + early-clobber %2:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128)) +... diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index a77892c8f5fc7..9dafa27ece86f 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -31,8 +31,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,8 +53,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -75,12 +75,12 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -89,13 +89,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_sle_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -104,10 +104,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_sle_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -145,7 +143,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -156,7 +154,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -167,7 +165,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -177,7 +175,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_sle_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -187,7 +185,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_sle_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -217,7 +215,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -228,7 +226,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -239,7 +237,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -249,7 +247,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_imin_sle_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -259,7 +257,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_imin_sle_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -292,8 +290,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; CI-LABEL: s_test_imin_sle_v4i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s11, s15 ; CI-NEXT: s_min_i32 s3, s10, s14 @@ -310,8 +308,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: s_test_imin_sle_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s11, s15 ; VI-NEXT: s_min_i32 s3, s10, s14 @@ -328,8 +326,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s11, s15 @@ -346,8 +344,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX10-LABEL: s_test_imin_sle_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s11, s15 @@ -364,8 +362,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX11-LABEL: s_test_imin_sle_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s7, s11 @@ -419,9 +417,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; CI-LABEL: s_test_imin_sle_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 @@ -434,9 +432,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; VI-LABEL: s_test_imin_sle_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 @@ -449,9 +447,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX9-LABEL: s_test_imin_sle_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -464,9 +462,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX10-LABEL: s_test_imin_sle_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i8 s2, s2 @@ -479,13 +477,13 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-LABEL: s_test_imin_sle_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i8 s2, s4 -; GFX11-NEXT: s_sext_i32_i8 s3, s5 +; GFX11-NEXT: s_sext_i32_i8 s2, s2 +; GFX11-NEXT: s_sext_i32_i8 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -556,9 +554,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; CI-LABEL: s_test_imin_sle_v4i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -589,9 +587,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2 ; VI-NEXT: v_lshrrev_b16_e64 v1, 8, s3 @@ -618,9 +616,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 @@ -646,9 +644,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-LABEL: s_test_imin_sle_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 @@ -675,27 +673,29 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX11-LABEL: s_test_imin_sle_v4i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: v_ashrrev_i16 v0, 8, s0 -; GFX11-NEXT: v_ashrrev_i16 v1, 8, s1 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: v_ashrrev_i16 v0, 8, s2 +; GFX11-NEXT: v_ashrrev_i16 v1, 8, s3 ; GFX11-NEXT: v_ashrrev_i16 v2, 8, s4 ; GFX11-NEXT: v_ashrrev_i16 v3, 8, s5 -; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80000 ; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX11-NEXT: v_min_i16 v4, s0, s1 +; GFX11-NEXT: v_min_i16 v4, s2, s3 ; GFX11-NEXT: v_min_i16 v5, s4, s5 ; GFX11-NEXT: v_min_i16 v2, v2, v3 ; GFX11-NEXT: v_min_i16 v0, v0, v1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -707,7 +707,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -752,7 +751,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -771,7 +770,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -790,7 +789,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -800,7 +799,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX10-LABEL: s_test_imin_sle_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3 @@ -809,7 +808,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX11-LABEL: s_test_imin_sle_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3 @@ -904,8 +903,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -934,8 +933,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -964,34 +963,34 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 ; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 ; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s5, s7 @@ -1031,8 +1030,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1053,8 +1052,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1075,12 +1074,12 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1089,13 +1088,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1104,10 +1103,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1172,8 +1169,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1194,8 +1191,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1216,12 +1213,12 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -1230,13 +1227,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] @@ -1245,10 +1242,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1287,7 +1282,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1298,7 +1293,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1309,7 +1304,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -1319,7 +1314,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_slt_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -1329,7 +1324,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_slt_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -1360,8 +1355,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1374,8 +1369,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: s_test_imin_slt_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1388,36 +1383,36 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s1, s1, s3 ; GFX9-NEXT: s_min_i32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s0, s0, s2 ; GFX10-NEXT: s_min_i32 s1, s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s4, s6 @@ -1448,8 +1443,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_slt_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1460,8 +1455,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_slt_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1472,8 +1467,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_slt_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1484,8 +1479,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_slt_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1496,11 +1491,11 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_slt_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, 8 +; GFX11-NEXT: s_min_i32 s2, s2, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1527,8 +1522,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_sle_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1539,8 +1534,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_sle_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1551,8 +1546,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_sle_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1563,8 +1558,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_sle_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1575,11 +1570,11 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_sle_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, 8 +; GFX11-NEXT: s_min_i32 s2, s2, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1617,8 +1612,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1639,8 +1634,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1661,12 +1656,12 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1675,13 +1670,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ule_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1690,10 +1685,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ule_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1748,8 +1741,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1772,8 +1765,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1796,12 +1789,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] +; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1812,13 +1805,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] +; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1829,10 +1822,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1911,8 +1902,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,8 +1938,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1975,12 +1966,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_min_u16 v0, v0, v2 @@ -1991,13 +1982,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX10-NEXT: v_pk_min_u16 v0, v0, v2 @@ -2008,10 +1999,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2053,7 +2042,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2064,7 +2053,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2075,7 +2064,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2085,7 +2074,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ule_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2095,7 +2084,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ule_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2136,8 +2125,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2158,8 +2147,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2180,12 +2169,12 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -2194,13 +2183,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ult_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -2209,10 +2198,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ult_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2268,8 +2255,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; CI-LABEL: v_test_umin_ult_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2289,8 +2276,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_test_umin_ult_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2310,11 +2297,11 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_test_umin_ult_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5] +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] @@ -2323,12 +2310,12 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX10-LABEL: v_test_umin_ult_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5] +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] @@ -2337,9 +2324,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX11-LABEL: v_test_umin_ult_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] @@ -2377,7 +2363,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2388,7 +2374,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2399,7 +2385,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2409,7 +2395,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ult_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2419,7 +2405,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ult_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2471,7 +2457,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2492,7 +2478,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2513,7 +2499,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i32_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2531,7 +2517,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i32_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2549,7 +2535,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i32_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 @@ -2621,7 +2607,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2643,7 +2629,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2665,7 +2651,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i16_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -2680,7 +2666,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i16_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -2696,7 +2682,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i16_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2735,7 +2721,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2746,7 +2732,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2757,7 +2743,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2767,7 +2753,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_umin_ult_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2777,7 +2763,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_umin_ult_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2818,8 +2804,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s4, s11, s19 ; CI-NEXT: s_min_u32 s5, s10, s18 @@ -2849,8 +2835,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s4, s11, s19 ; VI-NEXT: s_min_u32 s5, s10, s18 @@ -2880,8 +2866,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v8i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s4, s9, s17 @@ -2908,8 +2894,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX10-LABEL: s_test_umin_ult_v8i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s4, s9, s17 @@ -2935,8 +2921,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX11-LABEL: s_test_umin_ult_v8i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x20 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x20 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s7, s15 @@ -3109,8 +3095,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: s_and_b32 s3, s8, 0xffff @@ -3155,8 +3141,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; VI-LABEL: s_test_umin_ult_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s11, 16 ; VI-NEXT: s_lshr_b32 s4, s10, 16 @@ -3201,8 +3187,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; GFX9-LABEL: s_test_umin_ult_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s15 @@ -3219,8 +3205,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX10-LABEL: s_test_umin_ult_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v3, s11, s15 @@ -3233,8 +3219,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX11-LABEL: s_test_umin_ult_v8i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_u16 v3, s7, s11 @@ -3277,9 +3263,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3292,9 +3278,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3307,9 +3293,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -3322,9 +3308,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff @@ -3337,13 +3323,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff -; GFX11-NEXT: s_and_b32 s3, s5, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3386,9 +3372,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 @@ -3401,9 +3387,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 @@ -3416,9 +3402,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -3431,9 +3417,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 @@ -3446,13 +3432,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_sext_i32_i16 s3, s5 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 +; GFX11-NEXT: s_sext_i32_i16 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3503,8 +3489,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; CI-LABEL: s_test_imin_sle_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3517,8 +3503,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; VI-LABEL: s_test_imin_sle_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3531,8 +3517,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; GFX9-LABEL: s_test_imin_sle_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s3, s2 @@ -3545,8 +3531,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-LABEL: s_test_imin_sle_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s3, s2 @@ -3559,14 +3545,14 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-LABEL: s_test_imin_sle_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_ashr_i32 s3, s4, 16 +; GFX11-NEXT: s_sext_i32_i16 s3, s2 +; GFX11-NEXT: s_ashr_i32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s2, s3, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3599,8 +3585,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ult_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3617,8 +3603,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ult_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3635,16 +3621,16 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ult_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3653,14 +3639,14 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ult_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3669,8 +3655,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ult_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[6:7], s[0:1] @@ -3709,8 +3695,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ule_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3727,8 +3713,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ule_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3745,16 +3731,16 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ule_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3763,14 +3749,14 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ule_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_le_u64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3779,8 +3765,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ule_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s2, s[6:7], s[0:1] @@ -3819,8 +3805,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_slt_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3837,8 +3823,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_slt_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3855,16 +3841,16 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_slt_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3873,14 +3859,14 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_slt_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3889,8 +3875,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_slt_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[0:1] @@ -3929,8 +3915,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_sle_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3947,8 +3933,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_sle_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3965,16 +3951,16 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_sle_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3983,14 +3969,14 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_sle_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_le_i64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3999,8 +3985,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_sle_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s2, s[6:7], s[0:1] @@ -4062,8 +4048,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4093,8 +4079,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4117,12 +4103,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4131,13 +4117,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_sle_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4146,10 +4132,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_sle_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4214,8 +4198,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_ule_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4244,8 +4228,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_ule_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4268,12 +4252,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_ule_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4282,13 +4266,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_ule_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4297,10 +4281,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_ule_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index ece7e28c763fb..c98cfa08160ca 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone @@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 0889f8ef6316e..b4272049f36a4 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_mul_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_mul_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -50,7 +50,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_mul_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: test_mul_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_mul_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_mul_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -157,7 +157,7 @@ entry: define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -179,7 +179,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_mul_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -201,7 +201,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_mul_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -223,7 +223,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_mul_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_mul_v4i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -271,7 +271,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_mul_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -326,9 +326,9 @@ entry: define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: s_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -341,9 +341,9 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; VI-LABEL: s_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -356,10 +356,10 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; GFX9-LABEL: s_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[2:3], 0x34 -; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -373,11 +373,11 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX10-LABEL: s_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mul_i32 s0, s0, s6 +; GFX10-NEXT: s_mul_i32 s0, s2, s6 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -386,8 +386,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX11-LABEL: s_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mul_i32 s0, s0, s6 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX12-LABEL: s_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mul_i32 s0, s0, s6 @@ -433,98 +433,98 @@ entry: define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -548,8 +548,8 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-LABEL: v_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s14, s10 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,11 +617,11 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -630,43 +630,43 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s5, s4, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50 +; GFX9-NEXT: s_mulk_i32 s2, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_sext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX10-NEXT: s_mul_hi_i32 s3, s4, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 +; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_sext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX11-NEXT: s_mul_hi_i32 s3, s4, 0x50 +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -676,7 +676,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -711,8 +711,8 @@ entry: define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -725,11 +725,11 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -738,43 +738,43 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mul_hi_u32 s0, s2, 0x50 +; GFX9-NEXT: s_mulk_i32 s2, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_zext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX10-NEXT: s_mul_hi_u32 s3, s4, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s1, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_zext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX11-NEXT: s_mul_hi_u32 s3, s4, 0x50 +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -784,7 +784,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 @@ -818,7 +818,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -838,7 +838,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -857,7 +857,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -877,7 +877,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_sext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_sext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -965,7 +965,7 @@ entry: define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -985,7 +985,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_zext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1043,7 +1043,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_zext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1112,7 +1112,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_inline_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1131,7 +1131,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_mul64_sext_inline_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_mul64_sext_inline_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_mul64_sext_inline_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_mul64_sext_inline_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_mul64_sext_inline_imm: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1256,9 +1256,9 @@ entry: define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: s_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1269,9 +1269,9 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; VI-LABEL: s_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dword s5, s[0:1], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1282,41 +1282,40 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; GFX9-LABEL: s_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s0, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -1327,13 +1326,12 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX12-LABEL: s_mul_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_i32 s2, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1360,7 +1358,7 @@ entry: define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1378,7 +1376,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1396,7 +1394,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1414,7 +1412,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: v_mul_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1432,7 +1430,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: v_mul_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1452,7 +1450,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_mul_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1498,9 +1496,9 @@ entry: define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { ; SI-LABEL: s_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1512,9 +1510,9 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; VI-LABEL: s_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x70 -; VI-NEXT: s_load_dword s5, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x70 +; VI-NEXT: s_load_dword s5, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1526,42 +1524,42 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; GFX9-LABEL: s_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x70 -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mul_lo_u16_e32 v0, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i1: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mul_lo_u16 v0, s4, s5 +; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -1572,13 +1570,13 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-LABEL: s_mul_i1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -1622,7 +1620,7 @@ entry: define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1642,7 +1640,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: v_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1662,7 +1660,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: v_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1682,7 +1680,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX10-LABEL: v_mul_i1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1703,7 +1701,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX11-LABEL: v_mul_i1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1727,7 +1725,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX12-LABEL: v_mul_i1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1795,8 +1793,8 @@ entry: define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1815,8 +1813,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; VI-LABEL: s_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1833,8 +1831,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; GFX9-LABEL: s_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,18 +1852,18 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX10-LABEL: s_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s1, s6, s1 -; GFX10-NEXT: s_mul_hi_u32 s2, s6, s0 -; GFX10-NEXT: s_add_i32 s1, s2, s1 -; GFX10-NEXT: s_mul_i32 s2, s7, s0 -; GFX10-NEXT: s_mul_i32 s0, s6, s0 -; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_mul_i32 s0, s6, s3 +; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_add_i32 s0, s1, s0 +; GFX10-NEXT: s_mul_i32 s1, s7, s2 +; GFX10-NEXT: s_mul_i32 s2, s6, s2 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 @@ -1875,8 +1873,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX11-LABEL: s_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s1, s6, s1 @@ -1898,8 +1896,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX12-LABEL: s_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 @@ -1934,21 +1932,21 @@ entry: define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; SI-LABEL: v_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v1, v2, v1 ; SI-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -1956,52 +1954,52 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: v_mul_lo_u32 v0, v2, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v4, v2, v1 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 ; VI-NEXT: v_mul_lo_u32 v0, v3, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2009,27 +2007,27 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2037,14 +2035,14 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2074,8 +2072,8 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-LABEL: v_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, s10 @@ -2136,19 +2134,19 @@ entry: define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { ; SI-LABEL: mul32_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mul_i32 s6, s0, s1 +; SI-NEXT: s_mul_i32 s6, s2, s3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc @@ -2171,19 +2169,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul32_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mul_i32 s6, s0, s1 +; VI-NEXT: s_mul_i32 s6, s2, s3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: .LBB15_3: ; %Flow -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %if @@ -2206,19 +2204,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul32_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s6, s0, s1 +; GFX9-NEXT: s_mul_i32 s6, s2, s3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: .LBB15_3: ; %Flow -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %if @@ -2241,19 +2239,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul32_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s5, s0, s1 +; GFX10-NEXT: s_mul_i32 s5, s2, s3 ; GFX10-NEXT: s_branch .LBB15_3 ; GFX10-NEXT: .LBB15_2: ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: .LBB15_3: ; %Flow -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX10-NEXT: ; %bb.4: ; %if @@ -2276,19 +2274,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul32_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_mul_i32 s5, s0, s1 +; GFX11-NEXT: s_mul_i32 s5, s2, s3 ; GFX11-NEXT: s_branch .LBB15_3 ; GFX11-NEXT: .LBB15_2: ; GFX11-NEXT: s_mov_b32 s4, -1 ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: .LBB15_3: ; %Flow -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX11-NEXT: ; %bb.4: ; %if @@ -2313,19 +2311,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul32_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX12-NEXT: ; %bb.1: ; %else -; GFX12-NEXT: s_mul_i32 s5, s0, s1 +; GFX12-NEXT: s_mul_i32 s5, s2, s3 ; GFX12-NEXT: s_branch .LBB15_3 ; GFX12-NEXT: .LBB15_2: ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: ; implicit-def: $sgpr5 ; GFX12-NEXT: .LBB15_3: ; %Flow -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX12-NEXT: ; %bb.4: ; %if @@ -2405,7 +2403,7 @@ endif: define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; SI-LABEL: mul64_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -2440,7 +2438,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul64_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2472,7 +2470,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2508,7 +2506,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2542,7 +2540,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2577,7 +2575,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2670,9 +2668,9 @@ endif: define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { ; SI-LABEL: s_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x1f -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2719,9 +2717,9 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; VI-LABEL: s_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2758,96 +2756,96 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6 -; GFX9-NEXT: s_add_i32 s7, s12, s7 -; GFX9-NEXT: s_mul_i32 s12, s9, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s12 -; GFX9-NEXT: s_mul_i32 s12, s10, s5 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s11, s11, s4 -; GFX9-NEXT: s_mul_i32 s6, s8, s6 -; GFX9-NEXT: s_add_i32 s12, s12, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s4 -; GFX9-NEXT: s_add_u32 s10, s10, s6 -; GFX9-NEXT: s_addc_u32 s11, s12, s7 -; GFX9-NEXT: s_mul_i32 s14, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s5, s8 +; GFX9-NEXT: s_mul_i32 s0, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 +; GFX9-NEXT: s_mul_i32 s2, s14, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 +; GFX9-NEXT: s_add_i32 s0, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s13, s10 +; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_mul_i32 s3, s15, s8 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_i32 s1, s12, s10 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_i32 s3, s14, s8 +; GFX9-NEXT: s_add_u32 s3, s3, s1 +; GFX9-NEXT: s_addc_u32 s2, s2, s0 +; GFX9-NEXT: s_mul_i32 s14, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s7, s4, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 -; GFX9-NEXT: s_add_u32 s7, s7, s14 -; GFX9-NEXT: s_addc_u32 s12, s12, 0 -; GFX9-NEXT: s_add_u32 s12, s13, s12 -; GFX9-NEXT: s_addc_u32 s13, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s14, s5, s9 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: s_addc_u32 s9, s14, s13 -; GFX9-NEXT: s_add_u32 s10, s5, s10 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mov_b32 s5, s6 -; GFX9-NEXT: s_addc_u32 s9, s9, s11 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s1, s8, s13 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 +; GFX9-NEXT: s_add_u32 s1, s1, s14 +; GFX9-NEXT: s_addc_u32 s10, s10, 0 +; GFX9-NEXT: s_add_u32 s10, s11, s10 +; GFX9-NEXT: s_addc_u32 s11, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 +; GFX9-NEXT: s_mul_i32 s9, s9, s13 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s10, s14, s11 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s9, s9, s3 +; GFX9-NEXT: s_addc_u32 s10, s10, s2 +; GFX9-NEXT: s_mul_i32 s2, s8, s12 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s12 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s13, s2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s8, s7 +; GFX10-NEXT: s_mul_i32 s3, s8, s7 ; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX10-NEXT: s_mul_i32 s14, s10, s5 ; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX10-NEXT: s_mul_i32 s13, s9, s6 +; GFX10-NEXT: s_mul_i32 s12, s9, s6 ; GFX10-NEXT: s_mul_i32 s11, s11, s4 -; GFX10-NEXT: s_add_i32 s2, s7, s2 +; GFX10-NEXT: s_add_i32 s3, s7, s3 ; GFX10-NEXT: s_add_i32 s7, s15, s14 ; GFX10-NEXT: s_mul_i32 s6, s8, s6 ; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: s_add_i32 s2, s2, s13 +; GFX10-NEXT: s_add_i32 s3, s3, s12 ; GFX10-NEXT: s_add_i32 s7, s7, s11 ; GFX10-NEXT: s_mul_i32 s19, s5, s8 ; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX10-NEXT: s_add_u32 s6, s10, s6 ; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX10-NEXT: s_addc_u32 s7, s7, s2 +; GFX10-NEXT: s_addc_u32 s7, s7, s3 ; GFX10-NEXT: s_mul_i32 s17, s4, s9 -; GFX10-NEXT: s_add_u32 s2, s19, s20 +; GFX10-NEXT: s_add_u32 s3, s19, s20 ; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX10-NEXT: s_mul_i32 s5, s5, s9 ; GFX10-NEXT: s_addc_u32 s9, s18, 0 -; GFX10-NEXT: s_add_u32 s13, s17, s2 +; GFX10-NEXT: s_add_u32 s3, s17, s3 ; GFX10-NEXT: s_addc_u32 s10, s16, 0 -; GFX10-NEXT: s_mul_i32 s2, s4, s8 +; GFX10-NEXT: s_mul_i32 s12, s4, s8 ; GFX10-NEXT: s_add_u32 s4, s9, s10 ; GFX10-NEXT: s_addc_u32 s8, 0, 0 ; GFX10-NEXT: s_add_u32 s4, s5, s4 ; GFX10-NEXT: s_addc_u32 s5, s21, s8 ; GFX10-NEXT: s_add_u32 s4, s4, s6 ; GFX10-NEXT: s_addc_u32 s5, s5, s7 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -2860,46 +2858,46 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX11-LABEL: s_mul_i128: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x4c -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x7c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s12, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s3, s12 +; GFX11-NEXT: s_mov_b32 s13, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s8, s7 +; GFX11-NEXT: s_mul_i32 s3, s8, s7 ; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX11-NEXT: s_mul_i32 s14, s10, s5 ; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX11-NEXT: s_mul_i32 s13, s9, s6 +; GFX11-NEXT: s_mul_i32 s12, s9, s6 ; GFX11-NEXT: s_mul_i32 s11, s11, s4 -; GFX11-NEXT: s_add_i32 s2, s7, s2 +; GFX11-NEXT: s_add_i32 s3, s7, s3 ; GFX11-NEXT: s_add_i32 s7, s15, s14 ; GFX11-NEXT: s_mul_i32 s6, s8, s6 ; GFX11-NEXT: s_mul_i32 s10, s10, s4 -; GFX11-NEXT: s_add_i32 s2, s2, s13 +; GFX11-NEXT: s_add_i32 s3, s3, s12 ; GFX11-NEXT: s_add_i32 s7, s7, s11 ; GFX11-NEXT: s_mul_i32 s19, s5, s8 ; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX11-NEXT: s_add_u32 s6, s10, s6 ; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX11-NEXT: s_addc_u32 s7, s7, s2 +; GFX11-NEXT: s_addc_u32 s7, s7, s3 ; GFX11-NEXT: s_mul_i32 s17, s4, s9 -; GFX11-NEXT: s_add_u32 s2, s19, s20 +; GFX11-NEXT: s_add_u32 s3, s19, s20 ; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX11-NEXT: s_mul_i32 s5, s5, s9 ; GFX11-NEXT: s_addc_u32 s9, s18, 0 -; GFX11-NEXT: s_add_u32 s13, s17, s2 +; GFX11-NEXT: s_add_u32 s3, s17, s3 ; GFX11-NEXT: s_addc_u32 s10, s16, 0 -; GFX11-NEXT: s_mul_i32 s2, s4, s8 +; GFX11-NEXT: s_mul_i32 s12, s4, s8 ; GFX11-NEXT: s_add_u32 s4, s9, s10 ; GFX11-NEXT: s_addc_u32 s8, 0, 0 ; GFX11-NEXT: s_add_u32 s4, s5, s4 ; GFX11-NEXT: s_addc_u32 s5, s21, s8 ; GFX11-NEXT: s_add_u32 s4, s4, s6 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 @@ -2913,40 +2911,40 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX12-LABEL: s_mul_i128: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x7c -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x4c -; GFX12-NEXT: s_mov_b32 s13, 0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s15, s13 -; GFX12-NEXT: s_mov_b32 s3, s13 -; GFX12-NEXT: s_mov_b32 s17, s13 -; GFX12-NEXT: s_mov_b32 s19, s13 -; GFX12-NEXT: s_mov_b32 s24, s13 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s15, s3 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s17, s3 +; GFX12-NEXT: s_mov_b32 s19, s3 +; GFX12-NEXT: s_mov_b32 s24, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s4 +; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s14, s8 -; GFX12-NEXT: s_mov_b32 s2, s9 -; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13] -; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13] -; GFX12-NEXT: s_mov_b32 s12, s23 +; GFX12-NEXT: s_mov_b32 s12, s9 +; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3] +; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3] +; GFX12-NEXT: s_mov_b32 s2, s23 ; GFX12-NEXT: s_mov_b32 s16, s5 ; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11] -; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[12:13] +; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3] ; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9] ; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17] -; GFX12-NEXT: s_mov_b32 s12, s11 -; GFX12-NEXT: s_mov_b32 s11, s13 +; GFX12-NEXT: s_mov_b32 s2, s11 +; GFX12-NEXT: s_mov_b32 s11, s3 ; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5] ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11] -; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[16:17] +; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17] ; GFX12-NEXT: s_mov_b32 s18, s7 +; GFX12-NEXT: s_mov_b32 s23, s3 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] ; GFX12-NEXT: s_mov_b32 s25, s6 -; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19] -; GFX12-NEXT: s_mov_b32 s23, s13 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] -; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25] +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3] +; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 @@ -3013,7 +3011,7 @@ entry: define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { ; SI-LABEL: v_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 @@ -3062,7 +3060,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; VI-LABEL: v_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3102,7 +3100,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3133,7 +3131,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX10-LABEL: v_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3165,9 +3163,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX11-LABEL: v_mul_i128: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3205,9 +3201,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX12-LABEL: v_mul_i128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 842dc36e00154..58fd4b9bd2fee 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i32: @@ -100,7 +100,7 @@ entry: define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smulhi24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i64: @@ -274,26 +274,26 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { ; SI-LABEL: test_smul24_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x13 +; SI-NEXT: s_load_dword s0, s[0:1], 0x1c +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 -; SI-NEXT: s_bfe_i32 s5, s5, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_mul_i32 s4, s5, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 +; SI-NEXT: s_bfe_i32 s0, s0, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mul_i32 s1, s0, s1 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dword s5, s[0:1], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -307,19 +307,19 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 ; ; GFX9-LABEL: test_smul24_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s6, s5, s4 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64: @@ -376,8 +376,8 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i64_square: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -390,8 +390,8 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_smul24_i64_square: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -403,17 +403,17 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_smul24_i64_square: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s5, s4, s4 -; GFX9-NEXT: s_mul_i32 s4, s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64_square: @@ -463,33 +463,33 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s4, 8 -; SI-NEXT: s_lshl_b32 s7, s6, 8 -; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_mul_i32 s5, s4, s6 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_lshl_b32 s1, s2, 8 +; SI-NEXT: s_lshl_b32 s3, s0, 8 +; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_mul_i32 s1, s0, s2 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dword s5, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s4, 8 -; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_lshl_b32 s3, s2, 8 +; VI-NEXT: s_lshl_b32 s5, s4, 8 ; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -504,23 +504,23 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_smul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 -; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_lshl_b32 s1, s2, 8 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 +; GFX9-NEXT: s_lshl_b32 s1, s3, 8 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 +; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2 +; GFX9-NEXT: s_mul_i32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i33: @@ -580,9 +580,9 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dword s5, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dword s5, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,9 +594,9 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -608,20 +608,20 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_smulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshl_b32 s1, s2, 8 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 +; GFX9-NEXT: s_lshl_b32 s1, s3, 8 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 +; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i33: @@ -672,15 +672,15 @@ entry: define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { ; SI-LABEL: simplify_i24_crash: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %bb7 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB8_2: ; %bb11 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 @@ -694,15 +694,15 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; VI-LABEL: simplify_i24_crash: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %bb7 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB8_2: ; %bb11 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,24 +716,24 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; GFX9-LABEL: simplify_i24_crash: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %bb7 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB8_2: ; %bb11 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s6, 0x180000 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0x180000 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: simplify_i24_crash: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 0c0bb830ba847..698a54de108f7 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffffff @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %0 = shl i32 %a, 8 @@ -64,13 +64,13 @@ entry: define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16_sext: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_sext_i32_i16 s4, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_sext_i32_i16 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; VI-LABEL: test_umul24_i16_sext: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,16 +92,16 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; GFX9-LABEL: test_umul24_i16_sext: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_mul_i32 s2, s2, s0 +; GFX9-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -113,7 +113,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr_sext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; VI-LABEL: test_umul24_i16_vgpr_sext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -158,7 +158,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: test_umul24_i16_vgpr_sext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -186,13 +186,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -200,8 +200,8 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; VI-LABEL: test_umul24_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -214,16 +214,16 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; GFX9-LABEL: test_umul24_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_mul_i32 s2, s2, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -235,7 +235,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -258,7 +258,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_umul24_i16_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_umul24_i16_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,8 +307,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; SI-LABEL: test_umul24_i8_vgpr: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: v_mov_b32_e32 v4, 0 @@ -330,8 +330,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: test_umul24_i8_vgpr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 @@ -351,11 +351,11 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: test_umul24_i8_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v3, v1, s[0:1] +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -379,7 +379,7 @@ entry: define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi24_i32_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i32 %a, 16777215 @@ -432,9 +432,9 @@ entry: define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umulhi24: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -447,9 +447,9 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: test_umulhi24: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -462,18 +462,19 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX9-LABEL: test_umulhi24: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i64 %a, 16777215 @@ -489,9 +490,9 @@ entry: define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umul24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -508,9 +509,9 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; VI-LABEL: test_umul24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -524,20 +525,21 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; GFX9-LABEL: test_umul24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s2, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s1, s0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -580,8 +582,8 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: test_umul24_i64_square: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +596,8 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: test_umul24_i64_square: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -606,17 +608,17 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; GFX9-LABEL: test_umul24_i64_square: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s4 -; GFX9-NEXT: s_mul_i32 s4, s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -629,7 +631,7 @@ entry: define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffff @@ -645,7 +647,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -661,14 +663,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s6, 0xffff +; GFX9-NEXT: s_and_b32 s1, s7, 0xffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm entry: %a.16 = and i32 %a, 65535 @@ -683,27 +685,27 @@ entry: define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dword s5, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s6, s4, 0xffffff -; SI-NEXT: s_and_b32 s7, s5, 0xffffff -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 -; SI-NEXT: s_mul_i32 s6, s6, s7 +; SI-NEXT: s_and_b32 s1, s2, 0xffffff +; SI-NEXT: s_and_b32 s3, s0, 0xffffff +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; SI-NEXT: s_mul_i32 s1, s1, s3 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,20 +718,20 @@ define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_umul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff -; GFX9-NEXT: s_mul_i32 s6, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s2, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 @@ -745,9 +747,9 @@ entry: define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dword s5, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dword s5, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -759,9 +761,9 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_umulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -773,18 +775,18 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_umulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index eff80236d9866..0473f803bfb30 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -70,7 +70,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: scalar_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -185,40 +185,40 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { ; GFX6-LABEL: vector_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s12, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s12, s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s3, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s12, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s12, s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s6 -; GFX8-NEXT: s_mov_b32 s1, s7 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i32: @@ -246,8 +246,8 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: scalar_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -258,8 +258,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; ; GFX8-LABEL: scalar_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -286,8 +286,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_literal_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -300,8 +300,8 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; ; GFX8-LABEL: scalar_or_literal_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -332,43 +332,43 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: scalar_or_literal_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d ; GFX6-NEXT: s_movk_i32 s8, 0x3039 ; GFX6-NEXT: s_mov_b32 s9, 0xf237b -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_add_u32 s0, s6, 0x3039 -; GFX6-NEXT: s_addc_u32 s1, s7, 0xf237b +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_add_u32 s0, s0, 0x3039 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_addc_u32 s1, s1, 0xf237b ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_literal_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 ; GFX8-NEXT: s_movk_i32 s8, 0x3039 ; GFX8-NEXT: s_mov_b32 s9, 0xf237b ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_add_u32 s0, s2, 0x3039 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_add_u32 s0, s0, 0x3039 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_addc_u32 s1, s3, 0xf237b +; GFX8-NEXT: s_addc_u32 s1, s1, 0xf237b ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -408,8 +408,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -421,8 +421,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; GFX8-LABEL: scalar_or_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -451,44 +451,44 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s2, s6, 63 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_or_b32 s4, s6, 63 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: s_add_u32 s0, s0, 63 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_add_u32 s0, s8, 63 +; GFX6-NEXT: s_addc_u32 s1, s9, 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s2, s6, 63 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_or_b32 s4, s6, 63 +; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_add_u32 s0, s0, 63 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_add_u32 s0, s8, 63 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; @@ -521,8 +521,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_neg_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, -1 @@ -534,8 +534,8 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; ; GFX8-LABEL: scalar_or_neg_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 @@ -565,7 +565,7 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -583,7 +583,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -624,7 +624,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_inline_immediate_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -642,7 +642,7 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; ; GFX8-LABEL: vector_or_inline_immediate_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -683,8 +683,8 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -698,8 +698,8 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX8-LABEL: scalar_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -730,48 +730,48 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s14, s2 +; GFX6-NEXT: s_mov_b32 s15, s3 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s14, s2 +; GFX8-NEXT: s_mov_b32 s15, s3 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i64: @@ -803,42 +803,42 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { ; GFX6-LABEL: scalar_vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s3, s11 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s6 -; GFX8-NEXT: s_mov_b32 s1, s7 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: scalar_vector_or_i64: @@ -867,7 +867,7 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_loadimm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -886,7 +886,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_i64_loadimm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -931,7 +931,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -949,7 +949,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: vector_or_i64_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -990,7 +990,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_inline_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: vector_or_i64_neg_inline_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_literal: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: vector_or_i64_neg_literal: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1116,9 +1116,9 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: trunc_i64_or_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 -; GFX6-NEXT: s_load_dword s5, s[2:3], 0x1d -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX6-NEXT: s_load_dword s5, s[0:1], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,9 +1129,9 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; ; GFX8-LABEL: trunc_i64_or_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1159,21 +1159,21 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; GFX6-LABEL: or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s14, s2 +; GFX6-NEXT: s_mov_b32 s15, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(1) ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1181,26 +1181,26 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s14, s2 +; GFX8-NEXT: s_mov_b32 s15, s3 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: or_i1: @@ -1244,8 +1244,8 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GFX6-LABEL: s_or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,8 +1260,8 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; ; GFX8-LABEL: s_or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index c72a7ba3eee83..79082a54c6a36 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -4,17 +4,17 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -35,17 +35,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -67,17 +67,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -99,17 +99,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -132,17 +132,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_neg_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -163,17 +163,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_neg_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -194,16 +194,16 @@ bb: define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v0, v0 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] -; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: global_store_dword v2, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4 @@ -222,11 +222,11 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v3, v1 @@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa ; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 -; GCN-NEXT: global_store_dword v4, v0, s[0:1] +; GCN-NEXT: global_store_dword v4, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -261,10 +261,10 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v3, v1 @@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: global_store_dword v1, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index a547c258e3921..e076df97e1ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -1,18 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s -define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8: +define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -23,51 +19,27 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i8: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i8: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i8: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i8: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -78,56 +50,32 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i8: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i8: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: +define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -138,51 +86,29 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -193,56 +119,34 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: +define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -253,51 +157,27 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xffff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -308,56 +188,32 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: +define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -367,47 +223,25 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -417,52 +251,30 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i32 %arg0, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: +define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0 @@ -474,55 +286,29 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s5, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s5, s4 +; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s2, s0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s5, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s5, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -534,60 +320,34 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s6, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %add = add i32 %arg0, %arg1 store i32 %add, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: +define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -600,59 +360,33 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-1-NEXT: s_and_b32 s1, s4, 0xffff -; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s1, s0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 ; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff -; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -665,56 +399,30 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -723,8 +431,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ret void } -define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: +define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -734,47 +442,29 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-2-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -784,52 +474,34 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <2 x i8> %in, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { -; GFX940-NO-PRELOAD-LABEL: byref_preload_arg: +define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { +; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -843,63 +515,37 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: byref_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: byref_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: byref_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: byref_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: byref_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 @@ -913,59 +559,33 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] -; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] -; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-8-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -975,8 +595,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac } -define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i32_arg: +define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 @@ -995,83 +615,47 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v8i32_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_nop 1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v8i32_arg: -; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v8i32_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_nop 1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v8i32_arg: -; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i32_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -1090,87 +674,51 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v8i32_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v8i32_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v8i32_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v8i32_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i32> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg: +define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1181,51 +729,29 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1236,55 +762,33 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i16> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg: +define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -1296,55 +800,29 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -1356,59 +834,33 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i32> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg: +define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -1420,55 +872,29 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -1480,59 +906,33 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x float> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg: +define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1543,51 +943,43 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-2-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1598,55 +990,47 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x i8> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5f64_arg: +define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -1668,95 +1052,53 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v5f64_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_nop 1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v5f64_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v5f64_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_nop 1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v5f64_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5f64_arg: +; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -1778,99 +1120,57 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v5f64_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v5f64_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v5f64_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v5f64_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg: +define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1879,43 +1179,57 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1924,40 +1238,52 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i8> %in, ptr addrspace(1) %out ret void @@ -1974,44 +1300,22 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg: @@ -2024,44 +1328,22 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i64 %a, ptr addrspace(1) %out, align 8 ret void @@ -2078,44 +1360,22 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg: @@ -2128,47 +1388,1137 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store double %in, ptr addrspace(1) %out ret void } -attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) #0 { +; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store half %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) #0 { +; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store bfloat %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <2 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <3 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <6 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s10, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s7 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s11 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[10:11] offset:12 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s10, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[10:11] offset:12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[0:1] offset:12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store half %in, ptr addrspace(1) %out + store <7 x bfloat> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) #0 { +; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i1 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) #0 { +; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s13 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s13 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store fp128 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s5 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s5 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 +; GFX90a-PRELOAD-2-NEXT: global_store_short v2, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 +; GFX90a-PRELOAD-8-NEXT: global_store_short v2, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <7 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v1, s[6:7] offset:12 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] offset:12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <7 x half> %in, ptr addrspace(1) %out + ret void +} + +; Test when previous argument was not dword aligned. +define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s7 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0xc +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store i32 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: s_load_dword s7, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s8 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s8 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store <3 x i32> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store i16 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v2, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v2, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store <2 x i8> %in2, ptr addrspace(1) %out2 + ret void +} + +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 9a8d5acfbe3e9..2ce0b9eed02cb 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -5,28 +5,28 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) { ; SDAG-LABEL: buffers_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 ; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 ; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 ; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 ; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GISEL-NEXT: s_endpgm %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) %s0 = fmul float %l0, %l0 @@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; SDAG-NEXT: s_mov_b32 s7, 0 -; SDAG-NEXT: s_mov_b32 s6, 16 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-NEXT: s_mov_b32 s3, 0 +; SDAG-NEXT: s_mov_b32 s2, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_and_b32 s5, s1, 0xffff -; SDAG-NEXT: s_mov_b32 s4, s0 -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SDAG-NEXT: s_and_b32 s5, s3, 0xffff -; SDAG-NEXT: s_mov_b32 s4, s2 +; SDAG-NEXT: s_and_b32 s1, s5, 0xffff +; SDAG-NEXT: s_mov_b32 s0, s4 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_and_b32 s1, s7, 0xffff +; SDAG-NEXT: s_mov_b32 s0, s6 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 ; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 ; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 ; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GISEL-NEXT: s_mov_b32 s7, 0 -; GISEL-NEXT: s_mov_b32 s6, 16 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-NEXT: s_mov_b32 s3, 0 +; GISEL-NEXT: s_mov_b32 s2, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s5, s1, 0xffff -; GISEL-NEXT: s_mov_b32 s4, s0 -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GISEL-NEXT: s_and_b32 s5, s3, 0xffff -; GISEL-NEXT: s_mov_b32 s4, s2 +; GISEL-NEXT: s_and_b32 s1, s5, 0xffff +; GISEL-NEXT: s_mov_b32 s0, s4 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GISEL-NEXT: s_and_b32 s1, s7, 0xffff +; GISEL-NEXT: s_mov_b32 s0, s6 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 ; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GISEL-NEXT: s_endpgm %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0) %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac ; ; GISEL-LABEL: buffers_might_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -173,7 +173,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; SDAG-LABEL: independent_offsets: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; ; GISEL-LABEL: independent_offsets: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index fdce4431fbbf2..fcccd2da07f76 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -21,7 +21,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s3, 32, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotl_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -47,17 +47,17 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_sub_i32 s0, 32, s7 +; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 @@ -95,8 +95,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -111,8 +111,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, 32, s6 ; GFX8-NEXT: s_sub_i32 s3, 32, s7 @@ -128,22 +128,22 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s7 -; GFX10-NEXT: s_sub_i32 s3, 32, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_sub_i32 s0, 32, s7 +; GFX10-NEXT: s_sub_i32 s1, 32, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s0 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s7 @@ -188,8 +188,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,8 +210,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotl_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s9 ; GFX8-NEXT: s_sub_i32 s9, 32, s11 @@ -233,26 +233,26 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotl_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s8 -; GFX10-NEXT: s_sub_i32 s3, 32, s9 +; GFX10-NEXT: s_sub_i32 s0, 32, s8 +; GFX10-NEXT: s_sub_i32 s1, 32, s9 ; GFX10-NEXT: s_sub_i32 s8, 32, s11 ; GFX10-NEXT: s_sub_i32 s9, 32, s10 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s8 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s9 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s2 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s8 diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 0e1dd69d930ae..214894092a8b0 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotr_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 @@ -43,16 +43,16 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -84,8 +84,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -98,8 +98,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 @@ -113,20 +113,20 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s6 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7 @@ -161,8 +161,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -179,8 +179,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotr_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 @@ -198,22 +198,22 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s11 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s10 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s8 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 8c663d963b73e..acdcb631dccbd 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,20 +8,20 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s7, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -59,19 +59,19 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s7, s6 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -90,7 +90,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -153,9 +153,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -180,20 +178,20 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -214,21 +212,21 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s0, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s0, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -237,10 +235,9 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -250,12 +247,9 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -277,20 +271,20 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -311,21 +305,21 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s0, v2 -; CI-NEXT: v_lshl_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshl_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshl_b32_e32 v3, s0, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -334,10 +328,9 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -347,12 +340,9 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -374,7 +364,7 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -385,7 +375,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,7 +395,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -426,7 +416,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -437,9 +427,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -462,7 +450,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -473,7 +461,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -493,7 +481,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -510,7 +498,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -521,9 +509,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -546,7 +532,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -558,7 +544,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -580,7 +566,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -609,7 +595,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -621,9 +607,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -649,7 +633,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -661,7 +645,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -685,7 +669,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -708,7 +692,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -720,9 +704,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index b54df3b4d0c6c..9a03d216c7a99 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,13 +39,15 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s6 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mul_i32 s4, s6, s7 +; SI-NEXT: s_add_i32 s4, s4, s8 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_ashr_i32 s5, s4, 31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -54,13 +56,15 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_add_i32 s4, s4, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mul_i32 s4, s6, s7 +; VI-NEXT: s_add_i32 s4, s4, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_ashr_i32 s5, s4, 31 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -77,7 +81,7 @@ entry: define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +96,7 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,8 +117,8 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,8 +130,8 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun ; ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -144,7 +148,7 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -162,7 +166,7 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -186,8 +190,8 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind { ; SI-LABEL: s_sext_i16_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -199,8 +203,8 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun ; ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -217,7 +221,7 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -231,7 +235,7 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -255,8 +259,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; SI-LABEL: s_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -271,8 +275,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,13 +299,15 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s5, s6 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_cmp_eq_u32 s7, s8 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -310,13 +316,15 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s5, s6 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_cmp_eq_u32 s7, s8 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -342,8 +350,8 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -367,8 +375,8 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n ; ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,7 +415,7 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -435,7 +443,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -479,7 +487,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: s_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -505,7 +513,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) ; ; VI-LABEL: s_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -544,7 +552,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -572,7 +580,7 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 3644bef9c20a1..f88aaf389ca9a 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,12 +50,12 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s6, 14 +; HAWAII-NEXT: s_or_b32 s0, s4, 14 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s7 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v1, s2 @@ -70,12 +70,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s6, 14 +; FIJI-NEXT: s_or_b32 s0, s4, 14 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s7 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s3, s1, 0xffff @@ -94,9 +94,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -114,9 +114,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s3, s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, s2 @@ -133,16 +133,16 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX11-LABEL: local_store_i55: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[2:3] offset:14 +; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[0:1] offset:14 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s3, s1, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_mov_b32_e32 v3, s0 +; GFX11-NEXT: s_and_b32 s1, s3, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s3 +; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 ; GFX11-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s2 @@ -169,8 +169,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s2 @@ -182,8 +182,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -195,8 +195,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX10-LABEL: local_store_i48: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -208,10 +208,10 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX11-LABEL: local_store_i48: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:4 ; GFX11-NEXT: ds_store_b32 v0, v2 @@ -223,9 +223,9 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x4 -; HAWAII-NEXT: s_load_dword s3, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x4 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_and_b32 s2, s2, 1 @@ -239,9 +239,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x10 -; FIJI-NEXT: s_load_dword s3, s[6:7], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x10 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s2, s2, 1 @@ -255,9 +255,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 @@ -271,9 +271,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX10-LABEL: local_store_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 @@ -287,13 +287,13 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX11-LABEL: local_store_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 1 +; GFX11-NEXT: s_and_b32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: ds_store_b8 v2, v3 offset:8 ; GFX11-NEXT: ds_store_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 6044873563254..3ae982089228d 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX8-LABEL: s_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -33,17 +33,17 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_sub_i32 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -60,8 +60,8 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: s_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,10 +72,10 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX8-LABEL: s_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s4 +; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -84,18 +84,18 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX9-LABEL: s_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, 0x4d2, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_sub_i32 s0, 0x4d2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -194,7 +194,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: test_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: test_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -219,7 +219,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: test_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -238,7 +238,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -257,7 +257,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -272,7 +272,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -284,7 +284,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -306,7 +306,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -328,7 +328,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -350,7 +350,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -365,7 +365,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 @@ -391,7 +391,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -412,7 +412,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -432,7 +432,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -446,11 +446,9 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -474,7 +472,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -499,7 +497,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -517,7 +515,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -529,11 +527,9 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -555,7 +551,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -587,7 +583,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -608,7 +604,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -621,11 +617,9 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -648,8 +642,8 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { ; GFX6-LABEL: s_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -662,8 +656,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX8-LABEL: s_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_u32 s2, s4, s6 ; GFX8-NEXT: s_subb_u32 s3, s5, s7 @@ -676,22 +670,22 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX9-LABEL: s_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s4, s6 -; GFX9-NEXT: s_subb_u32 s3, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_sub_u32 s0, s4, s6 +; GFX9-NEXT: s_subb_u32 s1, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -708,8 +702,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind { ; GFX6-LABEL: v_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -731,8 +725,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: v_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -753,12 +747,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: v_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 @@ -769,12 +763,10 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX12-LABEL: v_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7] @@ -799,8 +791,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -824,8 +816,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -848,12 +840,12 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -866,12 +858,10 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7] @@ -898,8 +888,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -931,8 +921,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -971,14 +961,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v4i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -997,12 +987,10 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v16, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 +; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index fe234a82ba6f7..bfeab97d81dbe 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -8,13 +8,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -24,8 +24,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -67,10 +67,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -98,25 +96,25 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_pk_sub_i16 v0, s9, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -139,23 +137,23 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: s_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 +; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -177,7 +175,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; GCN-LABEL: s_test_sub_self_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +185,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_sub_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 @@ -197,7 +195,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_sub_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -216,20 +214,20 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -248,19 +246,19 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_sub_i16 v0, s6, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -279,7 +277,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -293,7 +291,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -312,7 +310,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_sub_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -326,9 +324,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_sub_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -353,7 +349,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -367,7 +363,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -386,7 +382,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_sub_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -400,9 +396,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_sub_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -426,7 +420,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -439,7 +433,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -458,7 +452,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -472,9 +466,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -498,7 +490,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -511,7 +503,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -529,7 +521,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -543,9 +535,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -570,7 +560,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -583,7 +573,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -601,7 +591,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -615,9 +605,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -642,13 +630,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -660,8 +648,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -684,13 +672,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -704,10 +692,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -740,14 +726,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -760,8 +746,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -786,13 +772,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -808,10 +794,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -845,13 +829,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -863,8 +847,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -889,13 +873,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -909,10 +893,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -945,12 +927,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -965,8 +947,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -992,13 +974,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -1015,10 +997,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index dfd9a650ff0e9..f686aad0cefc2 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -44,7 +44,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -80,7 +80,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -112,7 +112,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX1030-LABEL: udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -185,7 +185,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -251,7 +251,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 @@ -346,7 +346,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -456,7 +456,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -507,7 +507,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v2i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -619,7 +619,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s6, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s6, s10 @@ -809,7 +809,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -904,7 +904,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v4i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_pow2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: udiv_i32_div_pow2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1148,7 +1148,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GFX1030-LABEL: udiv_i32_div_pow2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_even: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: udiv_i32_div_k_even: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GFX1030-LABEL: udiv_i32_div_k_even: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1277,7 +1277,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_odd: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: udiv_i32_div_k_odd: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GFX1030-LABEL: udiv_i32_div_k_odd: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_udiv_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1429,7 +1429,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1452,7 +1452,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX1030-LABEL: v_udiv_i8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1569,7 +1569,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i16: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1651,7 +1651,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1688,7 +1688,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1725,7 +1725,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1770,7 +1770,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i23: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1885,7 +1885,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1922,7 +1922,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1967,7 +1967,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i24: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; SI-LABEL: scalarize_mulhu_4xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2076,7 +2076,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; VI-LABEL: scalarize_mulhu_4xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2130,7 +2130,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GFX1030-LABEL: scalarize_mulhu_4xi32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -2193,7 +2193,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read define amdgpu_kernel void @test_udiv2(i32 %p) { ; SI-LABEL: test_udiv2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; VI-LABEL: test_udiv2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2217,7 +2217,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GCN-LABEL: test_udiv2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2227,7 +2227,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GFX1030-LABEL: test_udiv2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; SI-LABEL: test_udiv_3_mulhu: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; VI-LABEL: test_udiv_3_mulhu: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -2279,7 +2279,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GCN-LABEL: test_udiv_3_mulhu: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2290,7 +2290,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GFX1030-LABEL: test_udiv_3_mulhu: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index a8f3635416cff..65eb1cee42350 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -13,37 +13,37 @@ declare double @llvm.fabs.f64(double) define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cnd_nan_nosgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cmp_eq_u32 s8, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cnd_nan_nosgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, 0 +; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc @@ -54,37 +54,35 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; ; GFX10-LABEL: v_cnd_nan_nosgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan_nosgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -109,7 +107,7 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { ; SI-LABEL: v_cnd_nan: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +122,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; VI-LABEL: v_cnd_nan: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -137,18 +135,18 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s2, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s7, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 @@ -171,30 +169,30 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -202,27 +200,26 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -238,30 +235,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -269,25 +266,24 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -305,30 +301,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -336,27 +332,26 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -372,30 +367,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -403,25 +398,24 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -439,16 +433,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -457,20 +451,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -478,34 +472,32 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -526,16 +518,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -544,20 +536,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -565,34 +557,32 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -613,8 +603,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -632,8 +622,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -652,10 +642,9 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -666,12 +655,9 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -695,8 +681,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -716,8 +702,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -741,13 +727,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc @@ -757,10 +743,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -789,8 +773,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -810,8 +794,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -835,13 +819,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc @@ -851,10 +835,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -883,8 +865,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -905,8 +887,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -931,13 +913,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc @@ -948,10 +930,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc @@ -981,8 +961,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1007,8 +987,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1036,14 +1016,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1056,10 +1036,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1092,8 +1070,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1118,8 +1096,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1147,14 +1125,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1167,10 +1145,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1205,8 +1181,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1231,8 +1207,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,14 +1236,14 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1280,10 +1256,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1316,8 +1290,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1341,8 +1315,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1369,13 +1343,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 @@ -1388,10 +1362,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc @@ -1426,8 +1398,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1451,8 +1423,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1479,14 +1451,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc @@ -1497,10 +1469,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1532,8 +1502,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1556,8 +1526,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,14 +1553,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1601,10 +1571,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1636,8 +1604,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1657,8 +1625,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1682,13 +1650,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc @@ -1698,10 +1666,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1731,8 +1697,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1756,8 +1722,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1785,13 +1751,13 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc @@ -1805,10 +1771,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1844,18 +1808,18 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 @@ -1863,22 +1827,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1891,15 +1855,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1907,23 +1871,21 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX10-NEXT: global_store_short v2, v0, s[0:1] +; GFX10-NEXT: global_store_short v2, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1948,37 +1910,37 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] @@ -1989,37 +1951,35 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] @@ -2041,18 +2001,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 @@ -2060,22 +2020,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 @@ -2089,15 +2049,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2106,23 +2066,21 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 8579cbdf47137..e3185e189157b 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -89,7 +89,7 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) { ; SDAG-VI-LABEL: basic_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 @@ -104,20 +104,20 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_med3_i16 v2, s2, 0, v1 -; SDAG-GFX9-NEXT: v_med3_i16 v1, s3, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v2, s6, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v1, s7, 0, v1 ; SDAG-GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SDAG-GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff @@ -132,7 +132,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-VI-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -156,27 +156,27 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff +; GISEL-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s6 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s7 +; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s0 +; GISEL-GFX9-NEXT: s_max_i32 s0, s3, s0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s4 -; GISEL-GFX9-NEXT: s_max_i32 s3, s3, s4 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s5 -; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s5 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s1 +; GISEL-GFX9-NEXT: s_min_i32 s0, s0, s1 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -413,13 +413,13 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) { ; SDAG-VI-LABEL: vec_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s2, 0 +; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 ; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 ; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 @@ -430,24 +430,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; SDAG-GFX9-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; SDAG-GFX9-NEXT: s_movk_i32 s2, 0xff +; SDAG-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX9-NEXT: s_movk_i32 s0, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX9-NEXT: v_pk_max_i16 v1, s4, 0 -; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s2 op_sel_hi:[1,0] -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s0 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0 +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -457,24 +457,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GISEL-VI-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: s_lshr_b32 s3, s4, 16 +; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16 +; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-VI-NEXT: s_max_i32 s4, s4, s2 -; GISEL-VI-NEXT: s_max_i32 s2, s3, s2 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-VI-NEXT: s_max_i32 s2, s2, s3 +; GISEL-VI-NEXT: s_max_i32 s3, s4, s3 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff +; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 ; GISEL-VI-NEXT: s_min_i32 s3, s3, s4 -; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 +; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 ; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 -; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16 -; GISEL-VI-NEXT: s_or_b32 s2, s3, s2 +; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 +; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 +; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -483,40 +483,40 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-GFX9-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s4 ; GISEL-GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GISEL-GFX9-NEXT: s_max_i32 s2, s3, s2 -; GISEL-GFX9-NEXT: s_max_i32 s3, s4, 0 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GISEL-GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX9-NEXT: s_max_i32 s0, s1, s0 +; GISEL-GFX9-NEXT: s_max_i32 s1, s4, 0 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GISEL-GFX9-NEXT: s_ashr_i32 s0, s0, 16 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0xff00ff -; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s4 -; GISEL-GFX9-NEXT: s_min_i32 s2, s2, 0xff -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX9-NEXT: s_min_i32 s1, s1, s4 +; GISEL-GFX9-NEXT: s_min_i32 s0, s0, 0xff +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s4 -; GISEL-GFX11-NEXT: s_ashr_i32 s4, s4, 16 -; GISEL-GFX11-NEXT: s_max_i32 s2, s3, s2 -; GISEL-GFX11-NEXT: s_max_i32 s3, s4, 0 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3 +; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 ; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index dae46361b9bcc..e12a4beb5dbe5 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -20,7 +20,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -101,7 +101,7 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) { define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_2xf16: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_2xf16: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -138,25 +138,25 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 { ; GFX1032-LABEL: test_vopc_class: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vopc_class: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 0x204 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 0x204 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 @@ -169,27 +169,27 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 ; GFX1032-LABEL: test_vcmp_vcnd_f16: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s4 ; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo -; GFX1032-NEXT: global_store_short v1, v0, s[0:1] +; GFX1032-NEXT: global_store_short v1, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vcmp_vcnd_f16: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_cmp_neq_f16_e64 vcc, 0x7c00, s4 ; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc -; GFX1064-NEXT: global_store_short v1, v0, s[0:1] +; GFX1064-NEXT: global_store_short v1, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %cmp = fcmp oeq half %x, 0x7FF0000000000000 %sel = select i1 %cmp, half 1.0, half %x @@ -200,7 +200,7 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -239,7 +239,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -253,7 +253,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -292,7 +292,7 @@ define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -318,10 +318,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_mask_if: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 10, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; %if -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dword v0, v0, s[0:1] @@ -331,10 +331,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1064-LABEL: test_mask_if: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 10, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; %if -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dword v0, v0, s[0:1] @@ -355,7 +355,7 @@ endif: define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_loop_with_if: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -516,42 +516,42 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-LABEL: test_loop_with_if_else_break: ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: ; implicit-def: $sgpr3 +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: ; implicit-def: $sgpr4 ; GFX1032-NEXT: s_branch .LBB11_4 ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_add_i32 s2, s2, 1 +; GFX1032-NEXT: s_add_i32 s3, s3, 1 ; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] -; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s2, v1 +; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1 ; GFX1032-NEXT: s_add_u32 s0, s0, 4 ; GFX1032-NEXT: s_addc_u32 s1, s1, 0 -; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s3, s3, s5 +; GFX1032-NEXT: s_or_b32 s4, s4, s5 ; GFX1032-NEXT: .LBB11_3: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 -; GFX1032-NEXT: s_or_b32 s4, s5, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, s5, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: .LBB11_4: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v3, v2, s[0:1] -; GFX1032-NEXT: s_or_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3 ; GFX1032-NEXT: s_cbranch_vccz .LBB11_2 ; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: ; implicit-def: $sgpr2 +; GFX1032-NEXT: ; implicit-def: $sgpr3 ; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1032-NEXT: s_branch .LBB11_3 ; GFX1032-NEXT: .LBB11_6: ; %.loopexit @@ -561,10 +561,10 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064: ; %bb.0: ; %bb ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_6 ; GFX1064-NEXT: ; %bb.1: ; %.preheader -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 @@ -631,7 +631,7 @@ bb8: define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_addc_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -643,7 +643,7 @@ define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_addc_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -664,7 +664,7 @@ bb: define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subbrev_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -676,7 +676,7 @@ define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) ; ; GFX1064-LABEL: test_subbrev_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -697,7 +697,7 @@ bb: define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subb_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -709,7 +709,7 @@ define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_subb_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -730,7 +730,7 @@ bb: define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -892,7 +892,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_udiv64: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1063,7 +1063,7 @@ bb: define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1077,7 +1077,7 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX1064-LABEL: test_div_scale_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1104,33 +1104,31 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_scale_f64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -1188,8 +1186,8 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-LABEL: test_div_fmas_f32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s5 @@ -1197,14 +1195,14 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] +; GFX1032-NEXT: global_store_dword v2, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s5 @@ -1212,7 +1210,7 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1064-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1064-NEXT: global_store_dword v2, v0, s[0:1] +; GFX1064-NEXT: global_store_dword v2, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 @@ -1223,14 +1221,14 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1032-LABEL: test_div_fmas_f64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: v_mov_b32_e32 v2, s10 ; GFX1032-NEXT: v_mov_b32_e32 v3, s11 -; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 s2, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -1240,14 +1238,14 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1064-LABEL: test_div_fmas_f64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s9 ; GFX1064-NEXT: v_mov_b32_e32 v2, s10 ; GFX1064-NEXT: v_mov_b32_e32 v3, s11 -; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 s2, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -1263,9 +1261,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1290,9 +1290,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1343,40 +1344,40 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX1032-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1032-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX1032-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1032-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1032-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX1032-NEXT: global_store_dword v1, v0, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 +; GFX1064-NEXT: v_div_scale_f32 v0, s[0:1], s7, s7, s6 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1064-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s2, s3, s2 +; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s6, s7, s6 ; GFX1064-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1064-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX1064-NEXT: global_store_dword v1, v0, s[4:5] ; GFX1064-NEXT: s_endpgm entry: %fdiv = fdiv float %a, %b @@ -1388,13 +1389,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1032-LABEL: test_br_cc_f16: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1032-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX1032-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; GFX1032-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1408,13 +1409,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1064-LABEL: test_br_cc_f16: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1064-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX1064-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v2 ; GFX1064-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1445,12 +1446,12 @@ two: define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 { ; GCN-LABEL: test_brcc_i1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp0_b32 s0, 0 +; GCN-NEXT: s_bitcmp0_b32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB25_2 ; GCN-NEXT: ; %bb.1: ; %store -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0xde ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1472,14 +1473,14 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1032-LABEL: test_preserve_condition_undef_flag: ; GFX1032: ; %bb.0: ; %bb0 ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dword s1, s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s2, s0, 1.0 -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s1, 1.0 -; GFX1032-NEXT: v_cmp_ngt_f32_e64 s0, s0, 0 -; GFX1032-NEXT: s_or_b32 s1, s2, s1 -; GFX1032-NEXT: s_or_b32 s0, s1, s0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s0, s2, 1.0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s3, 1.0 +; GFX1032-NEXT: v_cmp_ngt_f32_e64 s2, s2, 0 +; GFX1032-NEXT: s_or_b32 s0, s0, s1 +; GFX1032-NEXT: s_or_b32 s0, s0, s2 ; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX1032-NEXT: ; %bb.1: ; %bb1 @@ -1492,11 +1493,11 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1064-LABEL: test_preserve_condition_undef_flag: ; GFX1064: ; %bb.0: ; %bb0 ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dword s5, s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, 1.0 -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s5, 1.0 +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0 ; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s4, 0 ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] @@ -1530,7 +1531,7 @@ bb2: define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $sgpr1 ; GFX1032-NEXT: ; implicit-def: $sgpr2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1568,7 +1569,7 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GFX1064-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX1064-NEXT: ; implicit-def: $sgpr4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1634,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -1648,7 +1649,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1671,29 +1672,29 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-LABEL: test_set_inactive: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v0, 42 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v0, 42 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) store i32 %tmp, ptr addrspace(1) %out @@ -1703,7 +1704,7 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s2 @@ -1717,7 +1718,7 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s2 @@ -2137,23 +2138,23 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GFX1032-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) @@ -2165,26 +2166,26 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1064-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) store i64 %result, ptr addrspace(1) %out @@ -2194,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) @@ -2221,25 +2222,25 @@ define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -2353,7 +2354,7 @@ define amdgpu_ps float @test_ps_live() #0 { define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2373,7 +2374,7 @@ define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, pt ; ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2470,7 +2471,7 @@ main_body: define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2504,7 +2505,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2565,7 +2566,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-LABEL: fcmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2597,7 +2598,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; ; GFX1064-LABEL: fcmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 @@ -2657,7 +2658,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2691,7 +2692,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2751,7 +2752,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-LABEL: fcmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2783,7 +2784,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; ; GFX1064-LABEL: fcmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 9fac17f33d0d3..e15fd7f29671a 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -5,31 +5,31 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -54,33 +54,33 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v3, v7, v3 ; SI-NEXT: v_xor_b32_e32 v2, v6, v2 ; SI-NEXT: v_xor_b32_e32 v1, v5, v1 ; SI-NEXT: v_xor_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -107,8 +107,8 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 @@ -133,8 +133,8 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; ; VI-LABEL: xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -165,32 +165,32 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: v_xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -216,30 +216,30 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -263,7 +263,7 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: scalar_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -276,7 +276,7 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: scalar_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -292,8 +292,8 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: scalar_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -304,10 +304,10 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: scalar_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_not_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -321,7 +321,7 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -360,31 +360,31 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -409,8 +409,8 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -424,8 +424,8 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: scalar_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] @@ -442,7 +442,7 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: scalar_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -456,7 +456,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: scalar_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -473,7 +473,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -492,7 +492,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -514,7 +514,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { ; SI-LABEL: xor_cf: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -545,7 +545,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; ; VI-LABEL: xor_cf: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -591,8 +591,8 @@ endif: define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -605,15 +605,15 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: scalar_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s1, s1, 0xf237b -; VI-NEXT: s_xor_b32 s0, s0, 0x3039 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b32 s3, s3, 0xf237b +; VI-NEXT: s_xor_b32 s2, s2, 0x3039 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 @@ -624,30 +624,30 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_literal_multi_use_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x13 ; SI-NEXT: s_movk_i32 s8, 0x3039 ; SI-NEXT: s_mov_b32 s9, 0xf237b -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: s_add_u32 s0, s6, 0x3039 -; SI-NEXT: s_addc_u32 s1, s7, 0xf237b +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_add_u32 s0, s2, 0x3039 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_addc_u32 s1, s3, 0xf237b ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_literal_multi_use_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x3039 ; VI-NEXT: s_mov_b32 s3, 0xf237b ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -675,8 +675,8 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -688,14 +688,14 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: scalar_xor_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s0, s0, 63 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b32 s2, s2, 63 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 63 @@ -706,8 +706,8 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_neg_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -719,14 +719,14 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; ; VI-LABEL: scalar_xor_neg_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %or = xor i64 %a, -8 @@ -737,7 +737,7 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_i64_neg_inline_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -756,7 +756,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; ; VI-LABEL: vector_xor_i64_neg_inline_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -777,7 +777,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -796,7 +796,7 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: vector_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 From d3c12b1b4b6c44977ff5f1d0d6a70d72c4d951cf Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 20 Jun 2024 15:02:07 +0000 Subject: [PATCH 2/8] Used hasXnackReplay call to check if xnack feature is enabled. --- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 17 +- llvm/test/CodeGen/AMDGPU/merge-s-load.mir | 180 ++++-------------- 2 files changed, 42 insertions(+), 155 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 0b285d52b539e..f1c9f19346c7d 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -216,8 +216,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { CombineInfo &Paired, bool Modify = false); static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, const CombineInfo &Paired); - static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired, - const GCNSubtarget *STI = nullptr); + unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); static std::pair getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired); const TargetRegisterClass * @@ -344,7 +343,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORD: case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: case AMDGPU::GLOBAL_STORE_DWORD: @@ -513,7 +511,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORD_IMM_ec: case AMDGPU::S_LOAD_DWORDX2_IMM_ec: case AMDGPU::S_LOAD_DWORDX3_IMM_ec: case AMDGPU::S_LOAD_DWORDX4_IMM_ec: @@ -602,7 +599,6 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORD_IMM_ec: case AMDGPU::S_LOAD_DWORDX2_IMM_ec: case AMDGPU::S_LOAD_DWORDX3_IMM_ec: case AMDGPU::S_LOAD_DWORDX4_IMM_ec: @@ -719,7 +715,6 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORD_IMM_ec: case AMDGPU::S_LOAD_DWORDX2_IMM_ec: case AMDGPU::S_LOAD_DWORDX3_IMM_ec: case AMDGPU::S_LOAD_DWORDX4_IMM_ec: @@ -1476,7 +1471,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - const unsigned Opcode = getNewOpcode(CI, Paired, STM); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1688,8 +1683,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, - const CombineInfo &Paired, - const GCNSubtarget *STI) { + const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; switch (getCommonInstClass(CI, Paired)) { @@ -1732,8 +1726,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; } case S_LOAD_IMM: - // For targets that support XNACK replay, use the constrained load opcode. - if (STI && STI->hasXnackReplay()) { + // Use the constrained opcodes when the subtarget has the XNACK support + // enabled. + if (STM->isXNACKEnabled()) { switch (Width) { default: return 0; diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir index a87503c731c7a..b08da2e1848ff 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir @@ -8,9 +8,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x1_x1 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 %0:sgpr_64 = IMPLICIT_DEF %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) @@ -48,16 +48,16 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %4.sub0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %4.sub1 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: early-clobber %5:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %5.sub0_sub1 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %5.sub2 + ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 %0:sgpr_64 = IMPLICIT_DEF @@ -72,9 +72,9 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3 + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0 @@ -82,9 +82,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %7.sub0_sub1_sub2 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %7.sub3 + ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub2 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 @@ -102,9 +102,9 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7 + ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 @@ -120,9 +120,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub0_sub1_sub2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 @@ -157,9 +157,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x2_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2 + ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) @@ -171,9 +171,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x2_x2 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) @@ -185,9 +185,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x2_x2_x2_x2 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 @@ -205,9 +205,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x3_x1 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32)) @@ -219,118 +219,10 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x4_x4 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128)) %2:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128)) ... - -# The constrained multi-dword scalar load merge tests. ---- -name: merge_s_load_x1_x2ec -body: | - bb.0: - ; CHECK-LABEL: name: merge_s_load_x1_x2ec - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s64)) - %0:sgpr_64 = IMPLICIT_DEF - %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) - early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64)) -... - ---- -name: merge_s_load_x1_x3ec -body: | - bb.0: - ; CHECK-LABEL: name: merge_s_load_x1_x3ec - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16) - %0:sgpr_64 = IMPLICIT_DEF - %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) - early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96)) -... - ---- -name: merge_s_load_x2ec_x1 -body: | - bb.0: - ; GFX11-LABEL: name: merge_s_load_x2ec_x1 - ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) - ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) - ; - ; GFX12-LABEL: name: merge_s_load_x2ec_x1 - ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2 - %0:sgpr_64 = IMPLICIT_DEF - early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) - %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) -... - ---- -name: merge_s_load_x2ec_x2ec -body: | - bb.0: - ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3 - %0:sgpr_64 = IMPLICIT_DEF - early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) - early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) -... - ---- -name: merge_s_load_x2ec_x2ec_x2ec_x2ec -body: | - bb.0: - ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 - %0:sgpr_64 = IMPLICIT_DEF - early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) - early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) - early-clobber %3:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64)) - early-clobber %4:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64)) -... - ---- -name: merge_s_load_x3ec_x1 -body: | - bb.0: - ; CHECK-LABEL: name: merge_s_load_x3ec_x1 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3 - %0:sgpr_64 = IMPLICIT_DEF - early-clobber %1:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96)) - %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32)) -... - ---- -name: merge_s_load_x4ec_x4ec -body: | - bb.0: - ; CHECK-LABEL: name: merge_s_load_x4ec_x4ec - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 - %0:sgpr_64 = IMPLICIT_DEF - early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128)) - early-clobber %2:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128)) -... From feeaf70e0fe29f0cb42efb2653699d891f0fc75d Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Tue, 25 Jun 2024 05:21:48 +0000 Subject: [PATCH 3/8] take the alignment into consideration. --- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 52 +-- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 12 +- .../GlobalISel/llvm.amdgcn.update.dpp.ll | 28 +- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 152 +++---- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 84 ++-- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 14 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 408 +++++++++--------- llvm/test/CodeGen/AMDGPU/build_vector.ll | 12 +- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 88 ++-- .../CodeGen/AMDGPU/combine-cond-add-sub.ll | 20 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 180 ++++---- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 158 +++---- llvm/test/CodeGen/AMDGPU/cttz.ll | 184 ++++---- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 126 +++--- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 262 +++++------ .../AMDGPU/divergence-driven-buildvector.ll | 32 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 20 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 86 ++-- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 20 +- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 56 +-- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 180 ++++---- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 12 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 42 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 14 +- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 16 +- .../AMDGPU/global_atomics_i32_system.ll | 96 ++--- .../insert_waitcnt_for_precise_memory.ll | 160 +++---- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 14 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 32 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 24 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 112 ++--- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 244 +++++------ llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 48 +-- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 48 +-- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 76 ++-- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 14 +- llvm/test/CodeGen/AMDGPU/madak.ll | 104 ++--- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 18 +- .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 160 +++---- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 40 +- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 52 +-- llvm/test/CodeGen/AMDGPU/packed-op-sel.ll | 72 ++-- .../AMDGPU/ptr-buffer-alias-scheduling.ll | 36 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 8 +- llvm/test/CodeGen/AMDGPU/rotr.ll | 6 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 30 +- llvm/test/CodeGen/AMDGPU/sub.ll | 8 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 30 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 10 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 34 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 54 +-- 56 files changed, 1923 insertions(+), 1937 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index f1c9f19346c7d..7bdc494ff545d 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1725,36 +1725,30 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; } - case S_LOAD_IMM: - // Use the constrained opcodes when the subtarget has the XNACK support - // enabled. - if (STM->isXNACKEnabled()) { - switch (Width) { - default: - return 0; - case 2: - return AMDGPU::S_LOAD_DWORDX2_IMM_ec; - case 3: - return AMDGPU::S_LOAD_DWORDX3_IMM_ec; - case 4: - return AMDGPU::S_LOAD_DWORDX4_IMM_ec; - case 8: - return AMDGPU::S_LOAD_DWORDX8_IMM_ec; - } - } else { - switch (Width) { - default: - return 0; - case 2: - return AMDGPU::S_LOAD_DWORDX2_IMM; - case 3: - return AMDGPU::S_LOAD_DWORDX3_IMM; - case 4: - return AMDGPU::S_LOAD_DWORDX4_IMM; - case 8: - return AMDGPU::S_LOAD_DWORDX8_IMM; - } + case S_LOAD_IMM: { + // If XNACK is enabled, use the constrained opcodes when the first load is + // under-aligned. + const MachineMemOperand *MMO = *CI.I->memoperands_begin(); + auto NeedsConstrainedOpc = [&MMO, Width](const GCNSubtarget &ST) { + return ST.isXNACKEnabled() && MMO->getAlign().value() < Width; + }; + switch (Width) { + default: + return 0; + case 2: + return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX2_IMM_ec + : AMDGPU::S_LOAD_DWORDX2_IMM; + case 3: + return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX3_IMM_ec + : AMDGPU::S_LOAD_DWORDX3_IMM; + case 4: + return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX4_IMM_ec + : AMDGPU::S_LOAD_DWORDX4_IMM; + case 8: + return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX8_IMM_ec + : AMDGPU::S_LOAD_DWORDX8_IMM; } + } case GLOBAL_LOAD: switch (Width) { default: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index a018ea5bf18f1..fff341b07881b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -159,10 +159,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) @@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s3 -; GFX940-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_bf16 v1, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 1092bb4dc834a..727184a36c006 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: dpp_test: @@ -174,16 +174,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2i32_test: @@ -229,16 +229,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2f32_test: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 8a2274cbfbf62..b666f45521661 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -692,121 +692,121 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s14, 31 -; GFX9-NEXT: s_add_i32 s1, s14, s0 -; GFX9-NEXT: s_xor_b32 s1, s1, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_ashr_i32 s2, s15, 31 -; GFX9-NEXT: s_add_i32 s3, s15, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_ashr_i32 s8, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_ashr_i32 s9, s7, 31 +; GFX9-NEXT: s_add_i32 s7, s7, s9 +; GFX9-NEXT: s_xor_b32 s7, s7, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_ashr_i32 s4, s12, 31 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s12, 0, s6 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s7, 0, s3 -; GFX9-NEXT: s_ashr_i32 s5, s13, 31 -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s6, s12, s4 +; GFX9-NEXT: s_sub_i32 s12, 0, s7 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX9-NEXT: s_add_i32 s7, s13, s5 +; GFX9-NEXT: s_ashr_i32 s11, s5, 31 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: s_add_i32 s5, s5, s11 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_xor_b32 s7, s7, s5 -; GFX9-NEXT: s_xor_b32 s0, s4, s0 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1 +; GFX9-NEXT: s_xor_b32 s5, s5, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: s_xor_b32 s4, s10, s8 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 -; GFX9-NEXT: s_xor_b32 s0, s5, s2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s1, s14, 31 -; GFX10-NEXT: s_ashr_i32 s2, s15, 31 -; GFX10-NEXT: s_add_i32 s0, s14, s1 -; GFX10-NEXT: s_add_i32 s3, s15, s2 -; GFX10-NEXT: s_xor_b32 s4, s0, s1 +; GFX10-NEXT: s_ashr_i32 s1, s10, 31 +; GFX10-NEXT: s_ashr_i32 s2, s11, 31 +; GFX10-NEXT: s_add_i32 s0, s10, s1 +; GFX10-NEXT: s_add_i32 s3, s11, s2 +; GFX10-NEXT: s_xor_b32 s10, s0, s1 ; GFX10-NEXT: s_xor_b32 s3, s3, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s0, 0, s4 -; GFX10-NEXT: s_sub_i32 s5, 0, s3 -; GFX10-NEXT: s_ashr_i32 s6, s13, 31 +; GFX10-NEXT: s_sub_i32 s0, 0, s10 +; GFX10-NEXT: s_sub_i32 s11, 0, s3 +; GFX10-NEXT: s_ashr_i32 s12, s9, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_add_i32 s7, s13, s6 -; GFX10-NEXT: s_xor_b32 s7, s7, s6 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1 -; GFX10-NEXT: s_ashr_i32 s5, s12, 31 -; GFX10-NEXT: s_add_i32 s0, s12, s5 -; GFX10-NEXT: s_xor_b32 s1, s5, s1 -; GFX10-NEXT: s_xor_b32 s0, s0, s5 +; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1 +; GFX10-NEXT: s_ashr_i32 s11, s8, 31 +; GFX10-NEXT: s_add_i32 s0, s8, s11 +; GFX10-NEXT: s_add_i32 s8, s9, s12 +; GFX10-NEXT: s_xor_b32 s0, s0, s11 +; GFX10-NEXT: s_xor_b32 s8, s8, s12 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: s_xor_b32 s1, s11, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s7, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4 +; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 @@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s6, s2 +; GFX10-NEXT: s_xor_b32 s0, s12, s2 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s5, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 62d8b7d6f045c..a58397eccaba7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX9-NEXT: s_sub_i32 s0, 0, s14 -; GFX9-NEXT: s_sub_i32 s1, 0, s15 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX9-NEXT: s_sub_i32 s0, 0, s10 +; GFX9-NEXT: s_sub_i32 s1, 0, s11 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s14 +; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s10 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s15 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s12, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s8, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s15, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: s_sub_i32 s0, 0, s14 -; GFX10-NEXT: s_sub_i32 s1, 0, s15 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX10-NEXT: s_sub_i32 s0, 0, s10 +; GFX10-NEXT: s_sub_i32 s1, 0, s11 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s13, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s14 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s15 +; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s12, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s13, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9] -; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[10:11] +; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 611a7b566070c..6f67ce4de9ce5 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -243,21 +243,21 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_pk_add_u16 v1, s6, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_add_u16 v1, s2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s6, s7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 559871d162e13..8144fb7a3b646 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX9-NEXT: s_mul_i32 s1, s0, s7 -; GFX9-NEXT: s_sub_i32 s1, s6, s1 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_sub_i32 s3, s1, s7 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_add_i32 s5, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s7 -; GFX9-NEXT: s_sub_i32 s0, s6, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s4, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s1, s6, s7 -; GFX9-NEXT: s_abs_i32 s2, s6 -; GFX9-NEXT: s_sub_i32 s3, 0, s0 +; GFX9-NEXT: s_abs_i32 s4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: s_xor_b32 s3, s2, s3 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s1, s1, 31 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s6, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 -; GFX9-NEXT: s_mul_i32 s6, s3, s0 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s5, s4 ; GFX9-NEXT: s_sub_i32 s2, s2, s6 -; GFX9-NEXT: s_add_i32 s7, s3, 1 -; GFX9-NEXT: s_sub_i32 s6, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_sub_i32 s6, s2, s4 +; GFX9-NEXT: s_cmp_ge_u32 s2, s4 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 ; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_add_i32 s6, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s0, s6, s3 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_add_i32 s6, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s4 +; GFX9-NEXT: s_cselect_b32 s2, s6, s5 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 31 -; GFX9-NEXT: s_abs_i32 s2, s6 -; GFX9-NEXT: s_sub_i32 s3, 0, s0 +; GFX9-NEXT: s_abs_i32 s3, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s6, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 -; GFX9-NEXT: s_mul_i32 s3, s3, s0 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s3, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s0, s3, s2 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -5482,13 +5482,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s7, 12 -; GFX9-NEXT: s_lshr_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_add_i32 s3, s3, 12 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = udiv i32 %x, %shl.y @@ -5524,14 +5524,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s6, 12 -; GFX9-NEXT: s_lshr_b32 s1, s7, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_lshr_b32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b32 s3, s3, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5570,18 +5570,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s1, s7, 0x100101 -; GFX9-NEXT: s_sub_i32 s2, s7, s1 -; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_add_i32 s2, s2, s1 -; GFX9-NEXT: s_lshr_b32 s0, s6, 12 -; GFX9-NEXT: s_lshr_b32 s1, s2, 11 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b32 s3, s3, 11 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5875,14 +5875,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 -; GFX9-NEXT: s_add_i32 s0, s0, -1 -; GFX9-NEXT: s_and_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_add_i32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = urem i32 %x, %shl.y @@ -5918,14 +5918,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, 0xfff -; GFX9-NEXT: s_and_b32 s1, s7, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xfff +; GFX9-NEXT: s_and_b32 s3, s3, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6234,41 +6234,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_ashr_i32 s2, s6, 31 -; GFX9-NEXT: s_add_i32 s3, s6, s2 -; GFX9-NEXT: s_sub_i32 s6, 0, s0 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 +; GFX9-NEXT: s_ashr_i32 s5, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s6, s6, s7 ; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 ; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s3, s7 -; GFX9-NEXT: s_mul_i32 s8, s6, s0 -; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 +; GFX9-NEXT: s_mul_i32 s8, s6, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s8 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s3, s0 -; GFX9-NEXT: s_cmp_ge_u32 s3, s0 +; GFX9-NEXT: s_sub_i32 s8, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 +; GFX9-NEXT: s_cselect_b32 s2, s8, s2 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s0 -; GFX9-NEXT: s_cselect_b32 s0, s7, s6 -; GFX9-NEXT: s_xor_b32 s1, s2, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s7, s6 +; GFX9-NEXT: s_xor_b32 s3, s5, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = sdiv i32 %x, %shl.y @@ -6310,20 +6310,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s6, 31 -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s0, s6, s0 -; GFX9-NEXT: s_add_i32 s1, s7, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 12 -; GFX9-NEXT: s_ashr_i32 s1, s1, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_lshr_b32 s5, s5, 20 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_add_i32 s3, s3, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 12 +; GFX9-NEXT: s_ashr_i32 s3, s3, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6365,21 +6365,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s6, 31 -; GFX9-NEXT: s_mul_hi_i32 s1, s7, 0x80080081 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s1, s1, s7 -; GFX9-NEXT: s_add_i32 s0, s6, s0 -; GFX9-NEXT: s_lshr_b32 s2, s1, 31 -; GFX9-NEXT: s_ashr_i32 s1, s1, 11 -; GFX9-NEXT: s_ashr_i32 s0, s0, 12 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_mul_hi_i32 s5, s3, 0x80080081 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_i32 s5, s5, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s5, 31 +; GFX9-NEXT: s_ashr_i32 s4, s5, 11 +; GFX9-NEXT: s_ashr_i32 s2, s2, 12 +; GFX9-NEXT: s_add_i32 s4, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6746,38 +6746,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 31 -; GFX9-NEXT: s_add_i32 s2, s6, s1 -; GFX9-NEXT: s_sub_i32 s3, 0, s0 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s6, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 -; GFX9-NEXT: s_mul_i32 s3, s3, s0 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s3, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s0, s3, s2 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = srem i32 %x, %shl.y @@ -6821,22 +6821,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s6, 31 -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s0, s6, s0 -; GFX9-NEXT: s_add_i32 s1, s7, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s0, s6, s0 -; GFX9-NEXT: s_sub_i32 s1, s7, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_lshr_b32 s5, s5, 20 +; GFX9-NEXT: s_add_i32 s4, s2, s4 +; GFX9-NEXT: s_add_i32 s5, s3, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index e914635d6c26f..b26d15ed3a1c8 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s0, s7, 16 -; GFX940-NEXT: s_lshl_b32 s1, s6, 16 -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 +; GFX940-NEXT: s_lshl_b32 s3, s3, 16 +; GFX940-NEXT: s_lshl_b32 s2, s2, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm entry: %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index d511bb1f4a257..b6948dab6bf9f 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s4, 8 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_add_u32 s4, s0, 8 +; GFX10-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-NEXT: s_add_u32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_addc_u32 s7, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_clause 0x3 @@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX10-NEXT: flat_load_dword v9, v[2:3] ; GFX10-NEXT: flat_load_dword v10, v[4:5] ; GFX10-NEXT: flat_load_dword v11, v[6:7] -; GFX10-NEXT: s_add_u32 s0, s6, 8 -; GFX10-NEXT: s_addc_u32 s1, s7, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: s_add_u32 s0, s2, 8 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s6, 16 -; GFX10-NEXT: s_addc_u32 s1, s7, 0 -; GFX10-NEXT: s_add_u32 s2, s6, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: s_addc_u32 s3, s7, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 16 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_add_u32 s2, s2, 24 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 @@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s4, 8 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 -; GFX10-NEXT: s_add_u32 s0, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_add_u32 s4, s0, 8 +; GFX10-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_add_u32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_addc_u32 s7, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s0, 24 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: flat_load_dword v6, v[2:3] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 @@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: flat_load_dword v8, v[0:1] ; GFX10-NEXT: flat_load_dword v9, v[4:5] ; GFX10-NEXT: flat_load_dword v10, v[2:3] -; GFX10-NEXT: s_add_u32 s0, s6, 8 -; GFX10-NEXT: s_addc_u32 s1, s7, 0 -; GFX10-NEXT: s_add_u32 s2, s6, 16 +; GFX10-NEXT: s_add_u32 s0, s2, 8 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_add_u32 s4, s2, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s3, s7, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: s_addc_u32 s5, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s6, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: s_addc_u32 s1, s7, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_add_u32 s0, s2, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 9c7fa1537c0c2..33c0d90f94a39 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[4:5] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[4:5] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 3145ee1f6141e..4decf39d04013 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -164,28 +164,28 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32: @@ -277,32 +277,32 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v2i32: @@ -411,11 +411,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v3, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 @@ -425,16 +425,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -444,7 +444,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v4i32: @@ -550,28 +550,28 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8: @@ -737,24 +737,24 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s0, s[6:7] -; GFX10-NEXT: s_min_u32 s0, s0, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX10-NEXT: s_min_u32 s2, s2, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i64_trunc: @@ -847,25 +847,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -873,7 +873,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64: @@ -974,33 +974,33 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64_trunc: @@ -1090,29 +1090,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -1197,29 +1197,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -1313,32 +1313,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: @@ -1435,32 +1435,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: @@ -1552,22 +1552,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: @@ -1674,25 +1674,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 @@ -1700,7 +1700,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: @@ -1795,23 +1795,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index a377714ebf737..756b819099682 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -562,14 +562,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i64 %val, 0 @@ -650,17 +650,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone @@ -754,11 +754,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone @@ -870,13 +870,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -887,7 +887,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone @@ -1051,17 +1051,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -1159,11 +1159,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1171,7 +1171,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1283,12 +1283,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 @@ -1365,17 +1365,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1455,17 +1455,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1535,16 +1535,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1614,16 +1614,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1698,11 +1698,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1710,9 +1710,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[0:1] -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1800,17 +1800,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1889,16 +1889,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1973,16 +1973,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2058,16 +2058,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2143,16 +2143,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index ec532c8e4adc3..ee2894a66fbfc 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 @@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %valptr %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s0, s[6:7] -; GFX10-NEXT: s_min_u32 s0, s0, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] +; GFX10-NEXT: s_min_u32 s2, s2, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) %trunc = trunc i64 %cttz to i32 @@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 @@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s0 -; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[4:5] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s2 +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone @@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 086d99916ba04..392a44318b0a5 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i64 %val, 0 @@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone @@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone @@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone @@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 @@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 4226728dbe118..097604d57803e 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -933,24 +933,24 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_i8_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_i8_to_f32: @@ -1013,28 +1013,28 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v2i8_to_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v2i8_to_v2f32: @@ -1102,30 +1102,30 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v3i8_to_v3f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3i8_to_v3f32: @@ -1194,32 +1194,32 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32: @@ -1312,15 +1312,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -1329,19 +1329,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1350,7 +1350,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: @@ -1643,12 +1643,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1666,21 +1666,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dword v4, v5, s[6:7] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dword v4, v5, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] -; GFX9-NEXT: s_movk_i32 s0, 0xff00 -; GFX9-NEXT: s_movk_i32 s1, 0x900 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s5, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -1688,16 +1689,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; GFX9-NEXT: v_add_u16_e32 v8, 9, v4 -; GFX9-NEXT: v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v5, v0, s[6:7] +; GFX9-NEXT: global_store_dword v5, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: @@ -1837,17 +1839,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] offset:1 -; GFX10-NEXT: global_load_short_d16 v7, v0, s[6:7] offset:4 -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(4) @@ -1861,22 +1863,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[4:5] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v7i8_to_v7f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v7, v0, s[6:7] offset:2 -; GFX9-NEXT: global_load_ubyte v8, v0, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v9, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) @@ -1890,8 +1892,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 -; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] -; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[4:5] offset:16 +; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v7i8_to_v7f32: @@ -1988,11 +1990,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 @@ -2002,17 +2004,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[4:5] offset:16 -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v8i8_to_v8f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -2022,8 +2024,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[4:5] offset:16 -; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v8i8_to_v8f32: @@ -2096,28 +2098,28 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: @@ -2182,26 +2184,26 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: @@ -2264,24 +2266,24 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_i32_to_f32: @@ -2367,15 +2369,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -2384,19 +2386,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -2405,7 +2407,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: @@ -2477,26 +2479,26 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte0_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte0_to_f32: @@ -2558,26 +2560,26 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte1_to_f32: @@ -2640,26 +2642,26 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte2_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte2_to_f32: @@ -2722,26 +2724,26 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte3_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte3_to_f32: @@ -2823,16 +2825,16 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: cvt_ubyte0_or_multiuse: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 739fff5084135..8f31bb1fe0a81 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -376,22 +376,22 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[4:5] +; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_LH: @@ -466,22 +466,22 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[4:5] +; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_HH: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index ae280c5a443e1..b8936911f0576 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1831,21 +1831,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: s_lshr_b32 s1, s7, 16 -; GFX9-NEXT: s_lshr_b32 s2, s6, 16 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 6c5b2917855fc..0468175c5df50 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -133,8 +133,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ninf: @@ -275,21 +275,21 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ieee: @@ -370,16 +370,16 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7| -; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0 -; GFX10-NEXT: v_mul_f32_e32 v1, s7, v0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| +; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4 +; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_f32: @@ -482,18 +482,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s7 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 -; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s6 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s2 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: @@ -559,12 +559,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_fast_ieee_f32: @@ -623,12 +623,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_fast_math: @@ -687,12 +687,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: @@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -844,8 +844,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: @@ -916,12 +916,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index 26714dcc6dfac..e44572985e6d2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 66aacd7062a6d..5bd527149572e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s7, 31 -; GCN3-NEXT: s_mov_b32 s0, s7 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_ashr_i32 s5, s3, 31 +; GCN3-NEXT: s_mov_b32 s4, s3 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s7, 31 -; GCN3-NEXT: s_mov_b32 s0, s7 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_ashr_i32 s5, s3, 31 +; GCN3-NEXT: s_mov_b32 s4, s3 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] @@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s7, 31 -; GCN3-NEXT: s_mov_b32 s0, s7 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_ashr_i32 s5, s3, 31 +; GCN3-NEXT: s_mov_b32 s4, s3 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s7, 31 -; GCN3-NEXT: s_mov_b32 s0, s7 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_ashr_i32 s5, s3, 31 +; GCN3-NEXT: s_mov_b32 s4, s3 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 9943976dd86da..718be90eb75fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -1389,49 +1389,49 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_f16: @@ -1558,49 +1558,49 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: @@ -1727,49 +1727,49 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: @@ -1897,49 +1897,49 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: @@ -2067,49 +2067,49 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: @@ -2238,49 +2238,49 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index f55e9f4821b47..5761c198e20ba 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -316,10 +316,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_endpgm ; @@ -359,10 +359,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_bf16 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 3c4087fe391b6..4ea3323a9dbfc 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -137,12 +137,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 25 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: @@ -159,11 +159,11 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 25 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: @@ -734,15 +734,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s6, 7 -; GFX9-NEXT: s_or_b32 s0, s7, s0 -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s0, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_lshl_b32 s4, s2, 7 +; GFX9-NEXT: s_or_b32 s4, s3, s4 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: orxor2or1: @@ -761,15 +761,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s0, s6, 7 -; GFX10-NEXT: s_or_b32 s0, s7, s0 -; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: s_cselect_b32 s0, s6, s7 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_lshl_b32 s4, s2, 7 +; GFX10-NEXT: s_or_b32 s4, s3, s4 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: orxor2or1: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index e8377763e4be2..e8310e73f9a47 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -129,12 +129,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 7 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: @@ -151,11 +151,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index d4398e5367c7f..dac3a3db7b450 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 1fa7c52a68802..516c92f1640ea 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_mov_b32 s0, s7 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_mov_b32 s0, s7 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_mov_b32 s0, s7 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 +; GFX9-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_mov_b32 s0, s7 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 +; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index d5265e364a17e..df03e89370377 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -436,121 +436,121 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX9-NEXT: s_mul_i32 s1, s0, s7 -; GFX9-NEXT: s_sub_i32 s1, s6, s1 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_sub_i32 s3, s1, s7 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_add_i32 s5, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX90A-NEXT: s_sub_i32 s0, 0, s7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s1, v0 -; GFX90A-NEXT: s_mul_i32 s0, s0, s1 -; GFX90A-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX90A-NEXT: s_add_i32 s1, s1, s0 -; GFX90A-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX90A-NEXT: s_mul_i32 s1, s0, s7 -; GFX90A-NEXT: s_sub_i32 s1, s6, s1 -; GFX90A-NEXT: s_add_i32 s2, s0, 1 -; GFX90A-NEXT: s_sub_i32 s3, s1, s7 -; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 -; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 -; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 -; GFX90A-NEXT: s_add_i32 s2, s0, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 -; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] +; GFX90A-NEXT: v_readfirstlane_b32 s5, v0 +; GFX90A-NEXT: s_mul_i32 s4, s4, s5 +; GFX90A-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX90A-NEXT: s_add_i32 s5, s5, s4 +; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX90A-NEXT: s_mul_i32 s5, s4, s3 +; GFX90A-NEXT: s_sub_i32 s2, s2, s5 +; GFX90A-NEXT: s_add_i32 s6, s4, 1 +; GFX90A-NEXT: s_sub_i32 s5, s2, s3 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 +; GFX90A-NEXT: s_cselect_b32 s4, s6, s4 +; GFX90A-NEXT: s_cselect_b32 s2, s5, s2 +; GFX90A-NEXT: s_add_i32 s5, s4, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 +; GFX90A-NEXT: s_cselect_b32 s2, s5, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX10-NEXT: s_sub_i32 s5, 0, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s1, s1, s0 -; GFX10-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_mul_hi_u32 s0, s6, s0 -; GFX10-NEXT: s_mul_i32 s1, s0, s7 -; GFX10-NEXT: s_add_i32 s2, s0, 1 -; GFX10-NEXT: s_sub_i32 s1, s6, s1 -; GFX10-NEXT: s_sub_i32 s3, s1, s7 -; GFX10-NEXT: s_cmp_ge_u32 s1, s7 -; GFX10-NEXT: s_cselect_b32 s0, s2, s0 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_add_i32 s2, s0, 1 -; GFX10-NEXT: s_cmp_ge_u32 s1, s7 -; GFX10-NEXT: s_cselect_b32 s0, s2, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_mul_i32 s5, s5, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX10-NEXT: s_add_i32 s4, s4, s5 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, s4 +; GFX10-NEXT: s_mul_i32 s5, s4, s3 +; GFX10-NEXT: s_sub_i32 s2, s2, s5 +; GFX10-NEXT: s_add_i32 s5, s4, 1 +; GFX10-NEXT: s_sub_i32 s6, s2, s3 +; GFX10-NEXT: s_cmp_ge_u32 s2, s3 +; GFX10-NEXT: s_cselect_b32 s4, s5, s4 +; GFX10-NEXT: s_cselect_b32 s2, s6, s2 +; GFX10-NEXT: s_add_i32 s5, s4, 1 +; GFX10-NEXT: s_cmp_ge_u32 s2, s3 +; GFX10-NEXT: s_cselect_b32 s2, s5, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-FLATSCR-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-FLATSCR-NEXT: s_add_i32 s1, s1, s0 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s0, s7 -; GFX9-FLATSCR-NEXT: s_sub_i32 s1, s6, s1 -; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 -; GFX9-FLATSCR-NEXT: s_sub_i32 s3, s1, s7 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-FLATSCR-NEXT: s_add_i32 s5, s5, s4 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-FLATSCR-NEXT: s_mul_i32 s5, s4, s3 +; GFX9-FLATSCR-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-FLATSCR-NEXT: s_add_i32 s6, s4, 1 +; GFX9-FLATSCR-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-FLATSCR-NEXT: s_add_i32 s5, s4, 1 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 45a1afbf11992..920ff8a927e2d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -32,21 +32,21 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s6, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s6, s7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index 5d20a848bd6a6..17b941c59fd3f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -27,12 +27,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: @@ -50,12 +50,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) @@ -78,12 +78,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: @@ -101,12 +101,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 674fec1b865a6..ce055d6527996 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -30,14 +30,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: @@ -88,14 +88,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 8fe85e49a4207..309fd99031155 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -1759,16 +1759,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_gt_u32 s6, 1 -; GFX10-NEXT: s_cselect_b32 s0, -1, 0 -; GFX10-NEXT: s_cmp_gt_u32 s7, 2 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: s_and_b32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_cmp_gt_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10-NEXT: s_cmp_gt_u32 s3, 2 +; GFX10-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10-NEXT: s_and_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index a650f999835c6..5f979e0177f58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -1986,17 +1986,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_gt_u32 s6, 1 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_gt_u32 s7, 2 +; GFX9-NEXT: s_cmp_gt_u32 s2, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_u32 s3, 2 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index b0706025f0b68..dba67a03c000e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,12 +14,12 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s2, v0 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 ; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 @@ -44,7 +44,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 ; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v0, s3, v0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(4) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] ; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[128:131] -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 @@ -151,13 +151,13 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s2, v0 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 ; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v0, s3, v0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 @@ -218,7 +218,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index cd92529b77165..265d64f47bb23 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -245,26 +245,24 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs: @@ -327,17 +325,27 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 } define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-LABEL: v_permlane16_b32_vsv: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv: ; GFX11-SDAG: ; %bb.0: @@ -773,26 +781,24 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs: @@ -855,17 +861,27 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src } define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-LABEL: v_permlanex16_b32_vsv: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv: ; GFX11-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index ae5b62ffb285b..10f09b6390aba 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,18 +29,18 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -82,32 +82,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(7) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -149,14 +149,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -178,17 +178,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 ; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 @@ -198,12 +198,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -218,11 +218,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -242,14 +242,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 ; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -258,17 +258,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 @@ -278,12 +278,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -298,11 +298,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -322,14 +322,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -381,18 +381,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 @@ -403,25 +403,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -430,9 +430,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -452,15 +452,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -469,18 +469,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 @@ -491,25 +491,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -518,9 +518,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -540,15 +540,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -614,10 +614,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s2, v0 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -661,7 +661,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 2.0 -; GCN-NEXT: v_add_u32_e32 v0, s3, v0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(14) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -681,7 +681,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -720,10 +720,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -767,7 +767,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -787,7 +787,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -862,12 +862,12 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v0, s2, v1 +; GCN-NEXT: v_add_u32_e32 v0, s0, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -878,7 +878,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, s3, v1 +; GCN-NEXT: v_add_u32_e32 v1, s1, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 @@ -902,7 +902,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -995,12 +995,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s2, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s3, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 4ce0ff20e3b73..3a867879bb809 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -440,59 +440,59 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -502,18 +502,18 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 5ab960f47f57b..a162949587481 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -442,59 +442,59 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -504,18 +504,18 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp10_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 6cca705f7b1db..36e78975cdb01 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -270,25 +270,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s6, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 90a15ae8d9b28..035b2439eff15 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -313,25 +313,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 8196999b8f1f1..7ad7cc821c1b5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -133,57 +133,31 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; -; GFX8-LABEL: round_v2f32: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_brev_b32 s8, -2 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v0, s3 -; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0 -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s2 -; GFX8-NEXT: v_sub_f32_e32 v2, s2, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: round_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_brev_b32 s8, -2 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v0, s7 -; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2 -; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s6 -; GFX9-NEXT: v_sub_f32_e32 v2, s6, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: round_v2f32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_brev_b32 s8, -2 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_trunc_f32_e32 v0, s3 +; GFX89-NEXT: v_sub_f32_e32 v1, s3, v0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_mov_b32_e32 v2, s3 +; GFX89-NEXT: v_bfi_b32 v1, s8, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s2 +; GFX89-NEXT: v_sub_f32_e32 v2, s2, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; GFX89-NEXT: v_mov_b32_e32 v3, s2 +; GFX89-NEXT: v_bfi_b32 v2, s8, v2, v3 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index a54405bf1b471..994ef22539a65 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,12 +8,12 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s7, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s3, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_v2i16: @@ -54,11 +54,11 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshrrev_b16 v1, s7, s6 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_lshr_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 944db3d3adc3a..9ec37a5e14cdf 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -220,40 +220,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3 -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v4, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v0, v4, s[2:3] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc +; GFX10-MAD-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v3, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-MAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[6:7] offset:4 +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] offset:4 ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-MAD-NEXT: s_endpgm ; @@ -282,40 +282,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: v_fmac_f32_e32 v4, v1, v3 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc +; GFX10-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-FMA-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[6:7] offset:4 +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] offset:4 ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FMA-NEXT: s_endpgm ; @@ -398,24 +398,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: @@ -435,24 +435,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: @@ -931,23 +931,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mac_f32_e32 v1, s6, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mac_f32_e32 v1, s2, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-MAD-NEXT: v_madak_f32 v0, s6, v0, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-MAD-NEXT: v_madak_f32 v0, s2, v0, 0x41200000 +; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: s_s_madak_f32: @@ -964,23 +964,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s7 -; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s6, v2 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s2, v2 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-FMA-NEXT: v_fmaak_f32 v0, s6, v0, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000 +; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: s_s_madak_f32: diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 86a5055ab0704..940287d44d8d1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,21 +5,21 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: vector_clause: diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index c98cfa08160ca..9d6e0927b0dfd 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] -; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] -; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone @@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 58fd4b9bd2fee..357b851a8f56f 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i32: @@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i64: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 698a54de108f7..3a16c88f32cc3 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %0 = shl i32 %a, 8 @@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i32 %a, 16777215 @@ -663,14 +663,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, 0xffff -; GFX9-NEXT: s_and_b32 s1, s7, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %a.16 = and i32 %a, 65535 diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index 79082a54c6a36..c72a7ba3eee83 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -4,17 +4,17 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[4:5] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -35,17 +35,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[4:5] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -67,17 +67,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[4:5] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -99,17 +99,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[4:5] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -132,17 +132,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_neg_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[4:5] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -163,17 +163,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_neg_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[4:5] +; GCN-NEXT: global_store_dword v3, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -194,16 +194,16 @@ bb: define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v0, v0 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] -; GCN-NEXT: global_store_dword v2, v0, s[4:5] +; GCN-NEXT: global_store_dword v2, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4 @@ -222,11 +222,11 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v3, v1 @@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa ; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 -; GCN-NEXT: global_store_dword v4, v0, s[4:5] +; GCN-NEXT: global_store_dword v4, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -261,10 +261,10 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v3, v1 @@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v1, v0, s[4:5] +; GCN-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 2ce0b9eed02cb..ce92d40cca2b0 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; SDAG-NEXT: s_mov_b32 s3, 0 -; SDAG-NEXT: s_mov_b32 s2, 16 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-NEXT: s_mov_b32 s7, 0 +; SDAG-NEXT: s_mov_b32 s6, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_and_b32 s1, s5, 0xffff -; SDAG-NEXT: s_mov_b32 s0, s4 -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; SDAG-NEXT: s_and_b32 s1, s7, 0xffff -; SDAG-NEXT: s_mov_b32 s0, s6 +; SDAG-NEXT: s_and_b32 s5, s1, 0xffff +; SDAG-NEXT: s_mov_b32 s4, s0 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: s_and_b32 s5, s3, 0xffff +; SDAG-NEXT: s_mov_b32 s4, s2 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 ; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 ; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 ; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GISEL-NEXT: s_mov_b32 s3, 0 -; GISEL-NEXT: s_mov_b32 s2, 16 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_mov_b32 s7, 0 +; GISEL-NEXT: s_mov_b32 s6, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s1, s5, 0xffff -; GISEL-NEXT: s_mov_b32 s0, s4 -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GISEL-NEXT: s_and_b32 s1, s7, 0xffff -; GISEL-NEXT: s_mov_b32 s0, s6 +; GISEL-NEXT: s_and_b32 s5, s1, 0xffff +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: s_and_b32 s5, s3, 0xffff +; GISEL-NEXT: s_mov_b32 s4, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 ; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GISEL-NEXT: s_endpgm %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0) %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index fcccd2da07f76..a87973d93ac77 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -47,12 +47,12 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s0, 32, s7 -; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_sub_i32 s3, 32, s3 +; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_i32: diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 214894092a8b0..058ee589bc4b0 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -43,11 +43,11 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_i32: diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index acdcb631dccbd..b81af3eb838f1 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s7, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_shl_v2i16: @@ -59,14 +59,14 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s7, s6 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_shl_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 3ae982089228d..ded308ae4f230 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -33,12 +33,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s0, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i32: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index bfeab97d81dbe..6ec213a06999b 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -214,15 +214,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16_kernarg: @@ -246,14 +246,14 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s6, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index 65eb1cee42350..fc6df735c05b0 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -135,13 +135,13 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s7, s[0:1] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s2, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan: diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index e3185e189157b..89fef7eead839 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -104,15 +104,15 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_med3_i16 v2, s6, 0, v1 -; SDAG-GFX9-NEXT: v_med3_i16 v1, s7, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v2, s2, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v1, s3, 0, v1 ; SDAG-GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SDAG-GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: @@ -156,22 +156,22 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, 0xff +; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s0 -; GISEL-GFX9-NEXT: s_max_i32 s0, s3, s0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s1 -; GISEL-GFX9-NEXT: s_min_i32 s0, s0, s1 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s4 +; GISEL-GFX9-NEXT: s_max_i32 s3, s3, s4 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s5 +; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s5 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index e12a4beb5dbe5..901e88a4c6aca 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1344,40 +1344,40 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1032-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 +; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 ; GFX1032-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1032-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1032-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX1032-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_div_scale_f32 v0, s[0:1], s7, s7, s6 +; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1064-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s6, s7, s6 +; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s2, s3, s2 ; GFX1064-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1064-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX1064-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1064-NEXT: s_endpgm entry: %fdiv = fdiv float %a, %b @@ -2138,23 +2138,23 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| -; GFX1032-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| -; GFX1064-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s1 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) @@ -2195,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) From 99dc65d310f62f9c65b135c5f0329b19f2e1de53 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Mon, 1 Jul 2024 06:19:49 +0000 Subject: [PATCH 4/8] Removed unwanted helper function hasXnackReplay. --- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6ccee799a671e..def89c785b855 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1012,7 +1012,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } - bool hasXnackReplay() const { return GFX8Insts; } /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } From 7766fd00297354f7070dd7378ed65a0f24865347 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 3 Jul 2024 04:17:25 +0000 Subject: [PATCH 5/8] Used byte width and simplified some more code. --- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 28 +- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 12 +- .../GlobalISel/llvm.amdgcn.update.dpp.ll | 28 +- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 152 +++---- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 84 ++-- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 14 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 408 +++++++++--------- llvm/test/CodeGen/AMDGPU/build_vector.ll | 12 +- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 88 ++-- .../CodeGen/AMDGPU/combine-cond-add-sub.ll | 20 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 180 ++++---- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 158 +++---- llvm/test/CodeGen/AMDGPU/cttz.ll | 184 ++++---- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 126 +++--- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 262 ++++++----- .../AMDGPU/divergence-driven-buildvector.ll | 32 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 20 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 86 ++-- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 20 +- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 56 +-- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 180 ++++---- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 12 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 42 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 14 +- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 16 +- .../AMDGPU/global_atomics_i32_system.ll | 96 ++--- .../insert_waitcnt_for_precise_memory.ll | 160 +++---- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 14 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 32 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 24 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 112 +++-- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 244 +++++------ llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 48 +-- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 48 +-- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 76 ++-- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 14 +- llvm/test/CodeGen/AMDGPU/madak.ll | 104 ++--- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 18 +- .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 160 +++---- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 40 +- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 52 +-- llvm/test/CodeGen/AMDGPU/packed-op-sel.ll | 72 ++-- .../AMDGPU/ptr-buffer-alias-scheduling.ll | 36 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 8 +- llvm/test/CodeGen/AMDGPU/rotr.ll | 6 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 30 +- llvm/test/CodeGen/AMDGPU/sub.ll | 8 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 30 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 10 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 34 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 54 +-- 56 files changed, 1920 insertions(+), 1916 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 7bdc494ff545d..ea8ad39b90257 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1234,11 +1234,8 @@ void SILoadStoreOptimizer::copyToDestRegs( // The constrained sload instructions in S_LOAD_IMM class will have // `early-clobber` flag in the dst operand. Remove the flag before using the // MOs in copies. - if (Dest0->isEarlyClobber()) - Dest0->setIsEarlyClobber(false); - - if (Dest1->isEarlyClobber()) - Dest1->setIsEarlyClobber(false); + Dest0->setIsEarlyClobber(false); + Dest1->setIsEarlyClobber(false); BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. @@ -1729,24 +1726,23 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, // If XNACK is enabled, use the constrained opcodes when the first load is // under-aligned. const MachineMemOperand *MMO = *CI.I->memoperands_begin(); - auto NeedsConstrainedOpc = [&MMO, Width](const GCNSubtarget &ST) { - return ST.isXNACKEnabled() && MMO->getAlign().value() < Width; - }; + bool NeedsConstrainedOpc = + STM->isXNACKEnabled() && MMO->getAlign().value() < (Width << 2); switch (Width) { default: return 0; case 2: - return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX2_IMM_ec - : AMDGPU::S_LOAD_DWORDX2_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec + : AMDGPU::S_LOAD_DWORDX2_IMM; case 3: - return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX3_IMM_ec - : AMDGPU::S_LOAD_DWORDX3_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec + : AMDGPU::S_LOAD_DWORDX3_IMM; case 4: - return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX4_IMM_ec - : AMDGPU::S_LOAD_DWORDX4_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec + : AMDGPU::S_LOAD_DWORDX4_IMM; case 8: - return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX8_IMM_ec - : AMDGPU::S_LOAD_DWORDX8_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec + : AMDGPU::S_LOAD_DWORDX8_IMM; } } case GLOBAL_LOAD: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index fff341b07881b..a018ea5bf18f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -159,10 +159,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) @@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: v_mov_b32_e32 v0, s3 +; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_bf16 v1, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 727184a36c006..1092bb4dc834a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: dpp_test: @@ -174,16 +174,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2i32_test: @@ -229,16 +229,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2f32_test: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index b666f45521661..8a2274cbfbf62 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -692,121 +692,121 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s8, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s8 -; GFX9-NEXT: s_xor_b32 s6, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_ashr_i32 s9, s7, 31 -; GFX9-NEXT: s_add_i32 s7, s7, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s9 +; GFX9-NEXT: s_ashr_i32 s0, s14, 31 +; GFX9-NEXT: s_add_i32 s1, s14, s0 +; GFX9-NEXT: s_xor_b32 s1, s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_ashr_i32 s2, s15, 31 +; GFX9-NEXT: s_add_i32 s3, s15, s2 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s12, 0, s6 -; GFX9-NEXT: s_ashr_i32 s10, s4, 31 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_sub_i32 s6, 0, s1 +; GFX9-NEXT: s_ashr_i32 s4, s12, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s4, s4, s10 -; GFX9-NEXT: s_xor_b32 s4, s4, s10 -; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 +; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: s_ashr_i32 s5, s13, 31 +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s12, 0, s7 +; GFX9-NEXT: s_add_i32 s6, s12, s4 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_ashr_i32 s11, s5, 31 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: s_add_i32 s5, s5, s11 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 +; GFX9-NEXT: s_add_i32 s7, s13, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_xor_b32 s5, s5, s11 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 +; GFX9-NEXT: s_xor_b32 s7, s7, s5 +; GFX9-NEXT: s_xor_b32 s0, s4, s0 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s4, s10, s8 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 -; GFX9-NEXT: s_xor_b32 s4, s11, s9 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 +; GFX9-NEXT: s_xor_b32 s0, s5, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s1, s10, 31 -; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_add_i32 s0, s10, s1 -; GFX10-NEXT: s_add_i32 s3, s11, s2 -; GFX10-NEXT: s_xor_b32 s10, s0, s1 +; GFX10-NEXT: s_ashr_i32 s1, s14, 31 +; GFX10-NEXT: s_ashr_i32 s2, s15, 31 +; GFX10-NEXT: s_add_i32 s0, s14, s1 +; GFX10-NEXT: s_add_i32 s3, s15, s2 +; GFX10-NEXT: s_xor_b32 s4, s0, s1 ; GFX10-NEXT: s_xor_b32 s3, s3, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s0, 0, s10 -; GFX10-NEXT: s_sub_i32 s11, 0, s3 -; GFX10-NEXT: s_ashr_i32 s12, s9, 31 +; GFX10-NEXT: s_sub_i32 s0, 0, s4 +; GFX10-NEXT: s_sub_i32 s5, 0, s3 +; GFX10-NEXT: s_ashr_i32 s6, s13, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_add_i32 s7, s13, s6 +; GFX10-NEXT: s_xor_b32 s7, s7, s6 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1 -; GFX10-NEXT: s_ashr_i32 s11, s8, 31 -; GFX10-NEXT: s_add_i32 s0, s8, s11 -; GFX10-NEXT: s_add_i32 s8, s9, s12 -; GFX10-NEXT: s_xor_b32 s0, s0, s11 -; GFX10-NEXT: s_xor_b32 s8, s8, s12 +; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1 +; GFX10-NEXT: s_ashr_i32 s5, s12, 31 +; GFX10-NEXT: s_add_i32 s0, s12, s5 +; GFX10-NEXT: s_xor_b32 s1, s5, s1 +; GFX10-NEXT: s_xor_b32 s0, s0, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s1, s11, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX10-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 @@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s12, s2 +; GFX10-NEXT: s_xor_b32 s0, s6, s2 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 +; GFX10-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index a58397eccaba7..62d8b7d6f045c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_i32 s0, 0, s10 -; GFX9-NEXT: s_sub_i32 s1, 0, s11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX9-NEXT: s_sub_i32 s0, 0, s14 +; GFX9-NEXT: s_sub_i32 s1, 0, s15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s14 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s15 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s8, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s12, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s15, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX10-NEXT: s_sub_i32 s0, 0, s10 -; GFX10-NEXT: s_sub_i32 s1, 0, s11 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX10-NEXT: s_sub_i32 s0, 0, s14 +; GFX10-NEXT: s_sub_i32 s1, 0, s15 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s14 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s15 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s12, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s13, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[10:11] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 6f67ce4de9ce5..611a7b566070c 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -243,21 +243,21 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_pk_add_u16 v1, s2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_pk_add_u16 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_pk_add_u16 v1, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 8144fb7a3b646..559871d162e13 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s4, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s4, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s7 +; GFX9-NEXT: s_sub_i32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 -; GFX9-NEXT: s_xor_b32 s3, s2, s3 -; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: s_abs_i32 s0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_xor_b32 s1, s6, s7 +; GFX9-NEXT: s_abs_i32 s2, s6 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s3, s3, 31 +; GFX9-NEXT: s_ashr_i32 s1, s1, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s6, s5, s4 +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s3, s0 ; GFX9-NEXT: s_sub_i32 s2, s2, s6 -; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s2, s4 -; GFX9-NEXT: s_cmp_ge_u32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s3, 1 +; GFX9-NEXT: s_sub_i32 s6, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 ; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s2, s6, s5 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_add_i32 s6, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s6, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s3, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: s_abs_i32 s0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 31 +; GFX9-NEXT: s_abs_i32 s2, s6 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s3, s3, s0 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s3, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -5482,13 +5482,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s3, s3, 12 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_add_i32 s0, s7, 12 +; GFX9-NEXT: s_lshr_b32 s0, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = udiv i32 %x, %shl.y @@ -5524,14 +5524,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s2, 12 -; GFX9-NEXT: s_lshr_b32 s3, s3, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s6, 12 +; GFX9-NEXT: s_lshr_b32 s1, s7, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5570,18 +5570,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s2, s2, 12 -; GFX9-NEXT: s_lshr_b32 s3, s3, 11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s1, s7, 0x100101 +; GFX9-NEXT: s_sub_i32 s2, s7, s1 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_add_i32 s2, s2, s1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 12 +; GFX9-NEXT: s_lshr_b32 s1, s2, 11 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5875,14 +5875,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_add_i32 s3, s3, -1 -; GFX9-NEXT: s_and_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_add_i32 s0, s0, -1 +; GFX9-NEXT: s_and_b32 s0, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = urem i32 %x, %shl.y @@ -5918,14 +5918,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xfff -; GFX9-NEXT: s_and_b32 s3, s3, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_and_b32 s0, s6, 0xfff +; GFX9-NEXT: s_and_b32 s1, s7, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6234,41 +6234,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 -; GFX9-NEXT: s_ashr_i32 s5, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s5 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s2, s6, 31 +; GFX9-NEXT: s_add_i32 s3, s6, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s6, s6, s7 ; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 ; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 -; GFX9-NEXT: s_mul_i32 s8, s6, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, s7 +; GFX9-NEXT: s_mul_i32 s8, s6, s0 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_sub_i32 s8, s3, s0 +; GFX9-NEXT: s_cmp_ge_u32 s3, s0 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_cselect_b32 s2, s8, s2 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s7, s6 -; GFX9-NEXT: s_xor_b32 s3, s5, s4 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_cmp_ge_u32 s3, s0 +; GFX9-NEXT: s_cselect_b32 s0, s7, s6 +; GFX9-NEXT: s_xor_b32 s1, s2, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = sdiv i32 %x, %shl.y @@ -6310,20 +6310,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_lshr_b32 s5, s5, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_add_i32 s3, s3, s5 -; GFX9-NEXT: s_ashr_i32 s2, s2, 12 -; GFX9-NEXT: s_ashr_i32 s3, s3, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_lshr_b32 s1, s1, 20 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_add_i32 s1, s7, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 12 +; GFX9-NEXT: s_ashr_i32 s1, s1, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6365,21 +6365,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_mul_hi_i32 s5, s3, 0x80080081 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_add_i32 s5, s5, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s5, 31 -; GFX9-NEXT: s_ashr_i32 s4, s5, 11 -; GFX9-NEXT: s_ashr_i32 s2, s2, 12 -; GFX9-NEXT: s_add_i32 s4, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_mul_hi_i32 s1, s7, 0x80080081 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s1, s1, s7 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_lshr_b32 s2, s1, 31 +; GFX9-NEXT: s_ashr_i32 s1, s1, 11 +; GFX9-NEXT: s_ashr_i32 s0, s0, 12 +; GFX9-NEXT: s_add_i32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6746,38 +6746,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 31 +; GFX9-NEXT: s_add_i32 s2, s6, s1 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s3, s3, s0 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s3, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = srem i32 %x, %shl.y @@ -6821,22 +6821,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_lshr_b32 s5, s5, 20 -; GFX9-NEXT: s_add_i32 s4, s2, s4 -; GFX9-NEXT: s_add_i32 s5, s3, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s4, s5, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_lshr_b32 s1, s1, 20 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_add_i32 s1, s7, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s1, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index b26d15ed3a1c8..e914635d6c26f 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s3, s3, 16 -; GFX940-NEXT: s_lshl_b32 s2, s2, 16 -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_lshl_b32 s0, s7, 16 +; GFX940-NEXT: s_lshl_b32 s1, s6, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm entry: %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index b6948dab6bf9f..d511bb1f4a257 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 +; GFX10-NEXT: s_add_u32 s0, s4, 8 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: s_add_u32 s2, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_add_u32 s0, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_clause 0x3 @@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX10-NEXT: flat_load_dword v9, v[2:3] ; GFX10-NEXT: flat_load_dword v10, v[4:5] ; GFX10-NEXT: flat_load_dword v11, v[6:7] -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_add_u32 s0, s6, 8 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 16 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_add_u32 s2, s2, 24 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_add_u32 s0, s6, 16 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: s_add_u32 s2, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: s_addc_u32 s3, s7, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 @@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 +; GFX10-NEXT: s_add_u32 s0, s4, 8 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: s_add_u32 s2, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: s_add_u32 s0, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: flat_load_dword v6, v[2:3] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 @@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: flat_load_dword v8, v[0:1] ; GFX10-NEXT: flat_load_dword v9, v[4:5] ; GFX10-NEXT: flat_load_dword v10, v[2:3] -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: s_add_u32 s4, s2, 16 +; GFX10-NEXT: s_add_u32 s0, s6, 8 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: s_add_u32 s2, s6, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s5, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_addc_u32 s3, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: s_add_u32 s0, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 33c0d90f94a39..9c7fa1537c0c2 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 4decf39d04013..3145ee1f6141e 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -164,28 +164,28 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32: @@ -277,32 +277,32 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v2i32: @@ -411,11 +411,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v3, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 @@ -425,16 +425,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -444,7 +444,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v4i32: @@ -550,28 +550,28 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8: @@ -737,24 +737,24 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX10-NEXT: s_min_u32 s2, s2, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX10-NEXT: s_min_u32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i64_trunc: @@ -847,25 +847,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -873,7 +873,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64: @@ -974,33 +974,33 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64_trunc: @@ -1090,29 +1090,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -1197,29 +1197,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -1313,32 +1313,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: @@ -1435,32 +1435,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: @@ -1552,22 +1552,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: @@ -1674,25 +1674,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 @@ -1700,7 +1700,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: @@ -1795,23 +1795,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 756b819099682..a377714ebf737 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -562,14 +562,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i64 %val, 0 @@ -650,17 +650,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone @@ -754,11 +754,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone @@ -870,13 +870,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -887,7 +887,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone @@ -1051,17 +1051,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -1159,11 +1159,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1171,7 +1171,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1283,12 +1283,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 @@ -1365,17 +1365,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1455,17 +1455,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1535,16 +1535,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1614,16 +1614,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1698,11 +1698,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1710,9 +1710,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3] -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1800,17 +1800,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1889,16 +1889,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1973,16 +1973,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2058,16 +2058,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2143,16 +2143,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index ee2894a66fbfc..ec532c8e4adc3 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 @@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %valptr %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] -; GFX10-NEXT: s_min_u32 s2, s2, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX10-NEXT: s_min_u32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) %trunc = trunc i64 %cttz to i32 @@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 @@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s2 -; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s0 +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone @@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 392a44318b0a5..086d99916ba04 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i64 %val, 0 @@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone @@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone @@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone @@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 @@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 097604d57803e..4226728dbe118 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -933,24 +933,24 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_i8_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_i8_to_f32: @@ -1013,28 +1013,28 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v2i8_to_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v2i8_to_v2f32: @@ -1102,30 +1102,30 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v3i8_to_v3f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3i8_to_v3f32: @@ -1194,32 +1194,32 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32: @@ -1312,15 +1312,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -1329,19 +1329,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1350,7 +1350,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: @@ -1643,12 +1643,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1666,22 +1666,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX10-NEXT: global_store_dword v4, v5, s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dword v4, v5, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s5, 0x900 +; GFX9-NEXT: s_movk_i32 s0, 0xff00 +; GFX9-NEXT: s_movk_i32 s1, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -1689,17 +1688,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; GFX9-NEXT: v_add_u16_e32 v8, 9, v4 -; GFX9-NEXT: v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v5, v0, s[2:3] +; GFX9-NEXT: global_store_dword v5, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: @@ -1839,17 +1837,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_short_d16 v7, v0, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(4) @@ -1863,22 +1861,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[4:5] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v7i8_to_v7f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v7, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v8, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v9, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) @@ -1892,8 +1890,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 -; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[4:5] offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v7i8_to_v7f32: @@ -1990,11 +1988,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 @@ -2004,17 +2002,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[4:5] offset:16 +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v8i8_to_v8f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -2024,8 +2022,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[4:5] offset:16 +; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v8i8_to_v8f32: @@ -2098,28 +2096,28 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: @@ -2184,26 +2182,26 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: @@ -2266,24 +2264,24 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_i32_to_f32: @@ -2369,15 +2367,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -2386,19 +2384,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -2407,7 +2405,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: @@ -2479,26 +2477,26 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte0_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte0_to_f32: @@ -2560,26 +2558,26 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte1_to_f32: @@ -2642,26 +2640,26 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte2_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte2_to_f32: @@ -2724,26 +2722,26 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte3_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte3_to_f32: @@ -2825,16 +2823,16 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: cvt_ubyte0_or_multiuse: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 8f31bb1fe0a81..739fff5084135 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -376,22 +376,22 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[4:5] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_LH: @@ -466,22 +466,22 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[4:5] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_HH: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index b8936911f0576..ae280c5a443e1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1831,21 +1831,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 +; GFX9-NEXT: s_lshr_b32 s2, s6, 16 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 0468175c5df50..6c5b2917855fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -133,8 +133,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ninf: @@ -275,21 +275,21 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ieee: @@ -370,16 +370,16 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| -; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4 -; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7| +; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0 +; GFX10-NEXT: v_mul_f32_e32 v1, s7, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_f32: @@ -482,18 +482,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 -; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s2 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s7 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s6 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: @@ -559,12 +559,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_fast_ieee_f32: @@ -623,12 +623,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_fast_math: @@ -687,12 +687,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: @@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -844,8 +844,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: @@ -916,12 +916,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index e44572985e6d2..26714dcc6dfac 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 5bd527149572e..66aacd7062a6d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] @@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 718be90eb75fc..9943976dd86da 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -1389,49 +1389,49 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_f16: @@ -1558,49 +1558,49 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: @@ -1727,49 +1727,49 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: @@ -1897,49 +1897,49 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: @@ -2067,49 +2067,49 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: @@ -2238,49 +2238,49 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 5761c198e20ba..f55e9f4821b47 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -316,10 +316,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_endpgm ; @@ -359,10 +359,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_bf16 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 4ea3323a9dbfc..3c4087fe391b6 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -137,12 +137,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 25 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: @@ -159,11 +159,11 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 25 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: @@ -734,15 +734,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s4, s2, 7 -; GFX9-NEXT: s_or_b32 s4, s3, s4 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, s6, 7 +; GFX9-NEXT: s_or_b32 s0, s7, s0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: s_cselect_b32 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: orxor2or1: @@ -761,15 +761,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s4, s2, 7 -; GFX10-NEXT: s_or_b32 s4, s3, s4 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_lshl_b32 s0, s6, 7 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 +; GFX10-NEXT: s_cselect_b32 s0, s6, s7 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: orxor2or1: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index e8310e73f9a47..e8377763e4be2 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -129,12 +129,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 7 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: @@ -151,11 +151,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index dac3a3db7b450..d4398e5367c7f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 516c92f1640ea..1fa7c52a68802 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index df03e89370377..d5265e364a17e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -436,121 +436,121 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX90A-NEXT: s_sub_i32 s0, 0, s7 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s5, v0 -; GFX90A-NEXT: s_mul_i32 s4, s4, s5 -; GFX90A-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX90A-NEXT: s_add_i32 s5, s5, s4 -; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX90A-NEXT: s_mul_i32 s5, s4, s3 -; GFX90A-NEXT: s_sub_i32 s2, s2, s5 -; GFX90A-NEXT: s_add_i32 s6, s4, 1 -; GFX90A-NEXT: s_sub_i32 s5, s2, s3 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s6, s4 -; GFX90A-NEXT: s_cselect_b32 s2, s5, s2 -; GFX90A-NEXT: s_add_i32 s5, s4, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s2, s5, s4 -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: v_readfirstlane_b32 s1, v0 +; GFX90A-NEXT: s_mul_i32 s0, s0, s1 +; GFX90A-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX90A-NEXT: s_add_i32 s1, s1, s0 +; GFX90A-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX90A-NEXT: s_mul_i32 s1, s0, s7 +; GFX90A-NEXT: s_sub_i32 s1, s6, s1 +; GFX90A-NEXT: s_add_i32 s2, s0, 1 +; GFX90A-NEXT: s_sub_i32 s3, s1, s7 +; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 +; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 +; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 +; GFX90A-NEXT: s_add_i32 s2, s0, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 +; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX10-NEXT: s_sub_i32 s5, 0, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s5, s5, s4 -; GFX10-NEXT: s_mul_hi_u32 s5, s4, s5 -; GFX10-NEXT: s_add_i32 s4, s4, s5 -; GFX10-NEXT: s_mul_hi_u32 s4, s2, s4 -; GFX10-NEXT: s_mul_i32 s5, s4, s3 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_add_i32 s5, s4, 1 -; GFX10-NEXT: s_sub_i32 s6, s2, s3 -; GFX10-NEXT: s_cmp_ge_u32 s2, s3 -; GFX10-NEXT: s_cselect_b32 s4, s5, s4 -; GFX10-NEXT: s_cselect_b32 s2, s6, s2 -; GFX10-NEXT: s_add_i32 s5, s4, 1 -; GFX10-NEXT: s_cmp_ge_u32 s2, s3 -; GFX10-NEXT: s_cselect_b32 s2, s5, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_mul_i32 s1, s1, s0 +; GFX10-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX10-NEXT: s_mul_i32 s1, s0, s7 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_sub_i32 s1, s6, s1 +; GFX10-NEXT: s_sub_i32 s3, s1, s7 +; GFX10-NEXT: s_cmp_ge_u32 s1, s7 +; GFX10-NEXT: s_cselect_b32 s0, s2, s0 +; GFX10-NEXT: s_cselect_b32 s1, s3, s1 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_cmp_ge_u32 s1, s7 +; GFX10-NEXT: s_cselect_b32 s0, s2, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-FLATSCR-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-FLATSCR-NEXT: s_add_i32 s5, s5, s4 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-FLATSCR-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-FLATSCR-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-FLATSCR-NEXT: s_add_i32 s6, s4, 1 -; GFX9-FLATSCR-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-FLATSCR-NEXT: s_add_i32 s5, s4, 1 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-FLATSCR-NEXT: s_add_i32 s1, s1, s0 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-FLATSCR-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 +; GFX9-FLATSCR-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 920ff8a927e2d..45a1afbf11992 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -32,21 +32,21 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index 17b941c59fd3f..5d20a848bd6a6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -27,12 +27,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: @@ -50,12 +50,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) @@ -78,12 +78,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: @@ -101,12 +101,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index ce055d6527996..674fec1b865a6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -30,14 +30,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: @@ -88,14 +88,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 309fd99031155..8fe85e49a4207 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -1759,16 +1759,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_gt_u32 s2, 1 -; GFX10-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-NEXT: s_cmp_gt_u32 s3, 2 -; GFX10-NEXT: s_cselect_b32 s3, -1, 0 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_cmp_gt_u32 s6, 1 +; GFX10-NEXT: s_cselect_b32 s0, -1, 0 +; GFX10-NEXT: s_cmp_gt_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 +; GFX10-NEXT: s_and_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 5f979e0177f58..a650f999835c6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -1986,17 +1986,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_gt_u32 s2, 1 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_gt_u32 s3, 2 +; GFX9-NEXT: s_cmp_gt_u32 s6, 1 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_gt_u32 s7, 2 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index dba67a03c000e..b0706025f0b68 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,12 +14,12 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 ; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 @@ -44,7 +44,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 ; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(4) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] ; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[128:131] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 @@ -151,13 +151,13 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 ; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 @@ -218,7 +218,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 265d64f47bb23..cd92529b77165 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -245,24 +245,26 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs: @@ -325,27 +327,17 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 } define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vsv: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vsv: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX10-LABEL: v_permlane16_b32_vsv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv: ; GFX11-SDAG: ; %bb.0: @@ -781,24 +773,26 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs: @@ -861,27 +855,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src } define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX10-LABEL: v_permlanex16_b32_vsv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv: ; GFX11-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 10f09b6390aba..ae5b62ffb285b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,18 +29,18 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -82,32 +82,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(7) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -149,14 +149,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -178,17 +178,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 ; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 @@ -198,12 +198,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -218,11 +218,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -242,14 +242,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 ; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -258,17 +258,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 @@ -278,12 +278,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -298,11 +298,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -322,14 +322,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -381,18 +381,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 @@ -403,25 +403,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -430,9 +430,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -452,15 +452,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -469,18 +469,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 @@ -491,25 +491,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -518,9 +518,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -540,15 +540,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -614,10 +614,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -661,7 +661,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 2.0 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(14) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -681,7 +681,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -720,10 +720,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -767,7 +767,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -787,7 +787,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -862,12 +862,12 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v0, s0, v1 +; GCN-NEXT: v_add_u32_e32 v0, s2, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -878,7 +878,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-NEXT: v_add_u32_e32 v1, s3, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 @@ -902,7 +902,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -995,12 +995,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s2, v1 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s3, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s3 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 3a867879bb809..4ce0ff20e3b73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -440,59 +440,59 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -502,18 +502,18 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index a162949587481..5ab960f47f57b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -442,59 +442,59 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -504,18 +504,18 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp10_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 36e78975cdb01..6cca705f7b1db 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -270,25 +270,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 035b2439eff15..90a15ae8d9b28 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -313,25 +313,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 7ad7cc821c1b5..8196999b8f1f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -133,31 +133,57 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_v2f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX89-NEXT: s_brev_b32 s8, -2 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s3 -; GFX89-NEXT: v_sub_f32_e32 v1, s3, v0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v2, s3 -; GFX89-NEXT: v_bfi_b32 v1, s8, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s2 -; GFX89-NEXT: v_sub_f32_e32 v2, s2, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v3, s2 -; GFX89-NEXT: v_bfi_b32 v2, s8, v2, v3 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_brev_b32 s8, -2 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s3 +; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s2 +; GFX8-NEXT: v_sub_f32_e32 v2, s2, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_brev_b32 s8, -2 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s7 +; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v2, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 994ef22539a65..a54405bf1b471 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,12 +8,12 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s3, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s7, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_v2i16: @@ -54,11 +54,11 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, s7, s6 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_lshr_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 9ec37a5e14cdf..944db3d3adc3a 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -220,40 +220,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v4, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v4, s[6:7] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-MAD-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v3, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v2, s[4:5] ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] offset:4 +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[6:7] offset:4 ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-MAD-NEXT: s_endpgm ; @@ -282,40 +282,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: v_fmac_f32_e32 v4, v1, v3 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v2, s[4:5] ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] offset:4 +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[6:7] offset:4 ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FMA-NEXT: s_endpgm ; @@ -398,24 +398,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: @@ -435,24 +435,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: @@ -931,23 +931,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mac_f32_e32 v1, s2, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mac_f32_e32 v1, s6, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-MAD-NEXT: v_madak_f32 v0, s2, v0, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-MAD-NEXT: v_madak_f32 v0, s6, v0, 0x41200000 +; GFX10-MAD-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: s_s_madak_f32: @@ -964,23 +964,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s2, v2 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s7 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s6, v2 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-FMA-NEXT: v_fmaak_f32 v0, s6, v0, 0x41200000 +; GFX10-FMA-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: s_s_madak_f32: diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 940287d44d8d1..86a5055ab0704 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,21 +5,21 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: vector_clause: diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index 9d6e0927b0dfd..c98cfa08160ca 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone @@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 357b851a8f56f..58fd4b9bd2fee 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i32: @@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i64: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 3a16c88f32cc3..698a54de108f7 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %0 = shl i32 %a, 8 @@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i32 %a, 16777215 @@ -663,14 +663,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s6, 0xffff +; GFX9-NEXT: s_and_b32 s1, s7, 0xffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm entry: %a.16 = and i32 %a, 65535 diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index c72a7ba3eee83..79082a54c6a36 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -4,17 +4,17 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -35,17 +35,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -67,17 +67,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -99,17 +99,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -132,17 +132,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_neg_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -163,17 +163,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_neg_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -194,16 +194,16 @@ bb: define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v0, v0 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] -; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: global_store_dword v2, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4 @@ -222,11 +222,11 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v3, v1 @@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa ; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 -; GCN-NEXT: global_store_dword v4, v0, s[0:1] +; GCN-NEXT: global_store_dword v4, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -261,10 +261,10 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v3, v1 @@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: global_store_dword v1, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index ce92d40cca2b0..2ce0b9eed02cb 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-NEXT: s_mov_b32 s7, 0 -; SDAG-NEXT: s_mov_b32 s6, 16 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-NEXT: s_mov_b32 s3, 0 +; SDAG-NEXT: s_mov_b32 s2, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_and_b32 s5, s1, 0xffff -; SDAG-NEXT: s_mov_b32 s4, s0 -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SDAG-NEXT: s_and_b32 s5, s3, 0xffff -; SDAG-NEXT: s_mov_b32 s4, s2 +; SDAG-NEXT: s_and_b32 s1, s5, 0xffff +; SDAG-NEXT: s_mov_b32 s0, s4 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_and_b32 s1, s7, 0xffff +; SDAG-NEXT: s_mov_b32 s0, s6 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 ; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 ; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 ; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-NEXT: s_mov_b32 s7, 0 -; GISEL-NEXT: s_mov_b32 s6, 16 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-NEXT: s_mov_b32 s3, 0 +; GISEL-NEXT: s_mov_b32 s2, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s5, s1, 0xffff -; GISEL-NEXT: s_mov_b32 s4, s0 -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GISEL-NEXT: s_and_b32 s5, s3, 0xffff -; GISEL-NEXT: s_mov_b32 s4, s2 +; GISEL-NEXT: s_and_b32 s1, s5, 0xffff +; GISEL-NEXT: s_mov_b32 s0, s4 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GISEL-NEXT: s_and_b32 s1, s7, 0xffff +; GISEL-NEXT: s_mov_b32 s0, s6 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 ; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GISEL-NEXT: s_endpgm %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0) %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index a87973d93ac77..fcccd2da07f76 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -47,12 +47,12 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_sub_i32 s0, 32, s7 +; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_i32: diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 058ee589bc4b0..214894092a8b0 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -43,11 +43,11 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_i32: diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index b81af3eb838f1..acdcb631dccbd 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s7, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_shl_v2i16: @@ -59,14 +59,14 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s7, s6 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_shl_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index ded308ae4f230..3ae982089228d 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -33,12 +33,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_sub_i32 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i32: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 6ec213a06999b..bfeab97d81dbe 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -214,15 +214,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16_kernarg: @@ -246,14 +246,14 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_sub_i16 v0, s6, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index fc6df735c05b0..65eb1cee42350 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -135,13 +135,13 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s2, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s7, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan: diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 89fef7eead839..e3185e189157b 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -104,15 +104,15 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_med3_i16 v2, s2, 0, v1 -; SDAG-GFX9-NEXT: v_med3_i16 v1, s3, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v2, s6, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v1, s7, 0, v1 ; SDAG-GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SDAG-GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: @@ -156,22 +156,22 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff +; GISEL-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s6 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s7 +; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s0 +; GISEL-GFX9-NEXT: s_max_i32 s0, s3, s0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s4 -; GISEL-GFX9-NEXT: s_max_i32 s3, s3, s4 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s5 -; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s5 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s1 +; GISEL-GFX9-NEXT: s_min_i32 s0, s0, s1 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 901e88a4c6aca..e12a4beb5dbe5 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1344,40 +1344,40 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX1032-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1032-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX1032-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1032-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1032-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX1032-NEXT: global_store_dword v1, v0, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 +; GFX1064-NEXT: v_div_scale_f32 v0, s[0:1], s7, s7, s6 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1064-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s2, s3, s2 +; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s6, s7, s6 ; GFX1064-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1064-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX1064-NEXT: global_store_dword v1, v0, s[4:5] ; GFX1064-NEXT: s_endpgm entry: %fdiv = fdiv float %a, %b @@ -2138,23 +2138,23 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GFX1032-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) @@ -2195,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) From a9fba2c2773ff8cfcb88edbc90325ae49f28527b Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 3 Jul 2024 13:28:04 +0000 Subject: [PATCH 6/8] code fixup. --- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ea8ad39b90257..ae537b194f50c 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1727,7 +1727,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, // under-aligned. const MachineMemOperand *MMO = *CI.I->memoperands_begin(); bool NeedsConstrainedOpc = - STM->isXNACKEnabled() && MMO->getAlign().value() < (Width << 2); + STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4; switch (Width) { default: return 0; From 9a9c5a9fdc7ee05f0c0640e15c12596ade577917 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Mon, 8 Jul 2024 05:23:35 +0000 Subject: [PATCH 7/8] Extended merge-s-load.mir to test *_ec sload optimizations. --- .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 76 +- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 49 +- .../llvm.amdgcn.global.atomic.csub.ll | 24 +- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 121 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 86 +- .../GlobalISel/llvm.amdgcn.update.dpp.ll | 83 +- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 738 ++-- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 486 +-- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 224 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 2875 +++++++------- llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 48 +- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 56 +- llvm/test/CodeGen/AMDGPU/bfm.ll | 16 +- llvm/test/CodeGen/AMDGPU/bitreverse.ll | 162 +- llvm/test/CodeGen/AMDGPU/build_vector.ll | 74 +- .../CodeGen/AMDGPU/calling-conventions.ll | 212 +- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 12 +- .../CodeGen/AMDGPU/combine-cond-add-sub.ll | 226 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 270 +- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 387 +- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 88 +- llvm/test/CodeGen/AMDGPU/ctpop64.ll | 124 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 168 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 166 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 251 +- .../AMDGPU/divergence-driven-buildvector.ll | 152 +- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 243 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 213 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 167 +- llvm/test/CodeGen/AMDGPU/fabs.ll | 112 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 512 +-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 641 ++-- llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 337 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 290 +- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 3306 ++++++++--------- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 258 +- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 788 ++-- .../AMDGPU/fmul-2-combine-multi-use.ll | 324 +- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 356 +- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 123 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 72 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 178 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 48 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 218 +- .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 50 +- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 82 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 322 +- .../AMDGPU/fp-min-max-buffer-atomics.ll | 245 +- .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll | 231 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll | 6 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll | 6 +- llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll | 6 +- .../AMDGPU/fp64-min-max-buffer-atomics.ll | 128 +- .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll | 128 +- llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 118 +- llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 104 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 344 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 208 +- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 2702 +++++++------- .../AMDGPU/global_atomics_i32_system.ll | 244 +- llvm/test/CodeGen/AMDGPU/half.ll | 366 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 734 ++-- .../insert_waitcnt_for_precise_memory.ll | 271 +- llvm/test/CodeGen/AMDGPU/kernel-args.ll | 933 ++--- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 192 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 1112 +++--- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 1068 +++--- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 732 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 698 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 20 +- .../AMDGPU/llvm.amdgcn.intersect_ray.ll | 85 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 738 ++-- .../AMDGPU/llvm.amdgcn.permlane16.var.ll | 224 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 117 +- .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 363 +- .../llvm.amdgcn.sched.group.barrier.gfx11.ll | 28 +- .../llvm.amdgcn.sched.group.barrier.gfx12.ll | 142 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 300 +- .../AMDGPU/llvm.amdgcn.set.inactive.ll | 68 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 280 +- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 194 +- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 194 +- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 139 +- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 254 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 254 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 280 +- .../AMDGPU/llvm.r600.read.local.size.ll | 146 +- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 486 +-- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 156 +- llvm/test/CodeGen/AMDGPU/madak.ll | 516 +-- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 70 +- llvm/test/CodeGen/AMDGPU/merge-s-load.mir | 302 +- llvm/test/CodeGen/AMDGPU/min.ll | 830 +++-- .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 16 +- llvm/test/CodeGen/AMDGPU/mul.ll | 888 ++--- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 242 +- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 292 +- llvm/test/CodeGen/AMDGPU/or.ll | 376 +- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 1762 +++++---- .../AMDGPU/ptr-buffer-alias-scheduling.ll | 16 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 60 +- llvm/test/CodeGen/AMDGPU/rotr.ll | 44 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 154 +- llvm/test/CodeGen/AMDGPU/sign_extend.ll | 100 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 98 +- llvm/test/CodeGen/AMDGPU/sub.ll | 192 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 204 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 112 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 740 ++-- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 106 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 313 +- llvm/test/CodeGen/AMDGPU/xor.ll | 276 +- 112 files changed, 19657 insertions(+), 18910 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index eb20178f9f4d8..405b1e8f3a250 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -452,7 +452,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -468,7 +468,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -513,7 +513,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -539,7 +539,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -589,7 +589,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -644,7 +644,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -679,7 +679,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -725,14 +725,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -769,17 +769,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -821,7 +821,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -858,7 +858,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -949,7 +949,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -986,7 +986,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1005,7 +1005,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1221,7 +1221,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1310,7 +1310,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1337,7 +1337,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1354,7 +1354,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index a018ea5bf18f1..2d3b6ee3e9823 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -13,10 +13,10 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -27,7 +27,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -43,7 +43,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -84,10 +84,10 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -109,10 +109,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -134,12 +134,12 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] ; GFX940-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) ret void @@ -159,11 +159,12 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret void @@ -183,14 +184,12 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s3 -; GFX940-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: ds_pk_add_bf16 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret void @@ -200,10 +199,8 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret <2 x i16> %ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index ade6e55b482bb..59818b0b1bc39 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -134,8 +134,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -147,10 +147,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -160,7 +160,7 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN @@ -179,8 +179,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -190,16 +190,16 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index ce402fb4e4abc..d1fa579ca9b4b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 @@ -658,7 +658,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 @@ -688,33 +688,33 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX11-NEXT: s_mov_b32 s8, 0x40400000 -; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 +; GFX11-NEXT: s_mov_b32 s8, 0x40400000 +; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s1, 1.0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -742,7 +742,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 @@ -769,7 +769,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) @@ -796,28 +796,29 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s1, 1.0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -846,8 +847,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -865,7 +866,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: flat_load_dword v2, v[0:1] ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -875,8 +876,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -888,13 +889,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s2 -; GFX1013-NEXT: v_mov_b32_e32 v1, s3 +; GFX1013-NEXT: v_mov_b32_e32 v0, s0 +; GFX1013-NEXT: v_mov_b32_e32 v1, s1 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7] ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -903,29 +904,30 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_movk_i32 s17, 0x102 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX11-NEXT: v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s16 -; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v10, s17 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v11, v[0:1] @@ -957,8 +959,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -973,7 +975,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: flat_load_dword v2, v[0:1] ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -983,8 +985,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -993,13 +995,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s2 -; GFX1013-NEXT: v_mov_b32_e32 v1, s3 +; GFX1013-NEXT: v_mov_b32_e32 v0, s0 +; GFX1013-NEXT: v_mov_b32_e32 v1, s1 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -1008,23 +1010,24 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_movk_i32 s13, 0x102 ; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: s_movk_i32 s13, 0x102 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: s_mov_b32 s5, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: s_mov_b32 s5, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v8, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 0c60be9d94591..5074f8814546e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,11 +4,11 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec @@ -23,7 +23,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -43,20 +43,20 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 56 -; GCN-NEXT: s_cselect_b32 s4, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cselect_b32 s3, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_mov_b32 s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s3, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 @@ -96,12 +96,12 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -116,7 +116,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0xcccccccd ; GCN-NEXT: s_mov_b32 s5, 0x4010cccc ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -140,12 +140,12 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -160,12 +160,12 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -180,7 +180,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -204,7 +204,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1.0 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -228,12 +228,12 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -248,7 +248,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -272,7 +272,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -296,7 +296,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -320,7 +320,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -340,11 +340,11 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -359,11 +359,11 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -378,11 +378,11 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -397,11 +397,11 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 1092bb4dc834a..746bd739644a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -19,7 +19,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v1, s7 @@ -30,7 +30,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -46,7 +46,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) { ; GFX8-LABEL: update_dppi64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX10-LABEL: update_dppi64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -78,10 +78,11 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX11-LABEL: update_dppi64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -101,7 +102,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) { ; GFX8-LABEL: update_dppf64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -119,7 +120,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX10-LABEL: update_dppf64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -133,10 +134,11 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX11-LABEL: update_dppf64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -156,7 +158,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { ; GFX8-LABEL: update_dppv2i32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -174,7 +176,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -188,10 +190,11 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX11-LABEL: update_dppv2i32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -211,7 +214,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { ; GFX8-LABEL: update_dppv2f32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -229,7 +232,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -243,10 +246,11 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX11-LABEL: update_dppv2f32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -266,7 +270,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p0_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -284,7 +288,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX10-LABEL: update_dpp_p0_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -298,10 +302,11 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX11-LABEL: update_dpp_p0_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -321,7 +326,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p3_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -336,7 +341,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p3_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -349,7 +354,8 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX11-LABEL: update_dpp_p3_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -371,11 +377,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; GFX8-LABEL: update_dpp_p5_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s90, -1 ; GFX8-NEXT: s_mov_b32 s91, 0xe80000 -; GFX8-NEXT: s_add_u32 s88, s88, s3 +; GFX8-NEXT: s_add_u32 s88, s88, s9 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -390,26 +396,27 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p5_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 -; GFX10-NEXT: s_add_u32 s4, s4, s3 -; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s14, -1 +; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-NEXT: s_add_u32 s12, s12, s9 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen +; GFX10-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dpp_p5_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 8a2274cbfbf62..3bc9a582ebd96 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -6,83 +6,83 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s8, s7, 31 -; GFX8-NEXT: s_add_i32 s0, s7, s8 -; GFX8-NEXT: s_xor_b32 s7, s0, s8 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_ashr_i32 s8, s5, 31 +; GFX8-NEXT: s_add_i32 s0, s5, s8 +; GFX8-NEXT: s_xor_b32 s5, s0, s8 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_ashr_i32 s4, s6, 31 -; GFX8-NEXT: s_add_i32 s5, s6, s4 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s5, s5, s4 -; GFX8-NEXT: s_xor_b32 s6, s4, s8 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s7, s6, s8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s5, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -95,16 +95,17 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s1, s1, s4 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s7, s1, s6 +; GFX10-NEXT: s_xor_b32 s5, s1, s4 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s1, 0, s5 +; GFX10-NEXT: s_xor_b32 s4, s8, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -112,18 +113,17 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -145,7 +145,7 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 @@ -305,7 +305,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s9, 31 ; GFX9-NEXT: s_ashr_i32 s12, s11, 31 @@ -459,7 +459,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s2, s9, 31 ; GFX10-NEXT: s_ashr_i32 s12, s11, 31 @@ -616,7 +616,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 @@ -692,7 +692,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s14, 31 ; GFX9-NEXT: s_add_i32 s1, s14, s0 @@ -765,7 +765,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s1, s14, 31 ; GFX10-NEXT: s_ashr_i32 s2, s15, 31 @@ -845,8 +845,8 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s12, 31 ; GFX8-NEXT: s_add_i32 s0, s12, s2 @@ -986,19 +986,19 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s6, s12, 31 -; GFX9-NEXT: s_add_i32 s0, s12, s6 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_ashr_i32 s4, s13, 31 -; GFX9-NEXT: s_add_i32 s5, s13, s4 +; GFX9-NEXT: s_ashr_i32 s4, s12, 31 +; GFX9-NEXT: s_add_i32 s0, s12, s4 +; GFX9-NEXT: s_xor_b32 s5, s0, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_ashr_i32 s6, s13, 31 +; GFX9-NEXT: s_add_i32 s7, s13, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: s_sub_i32 s13, 0, s7 +; GFX9-NEXT: s_xor_b32 s7, s7, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s13, 0, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_xor_b32 s8, s8, s12 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 +; GFX9-NEXT: s_sub_i32 s13, 0, s7 ; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 ; GFX9-NEXT: s_ashr_i32 s13, s9, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -1017,62 +1017,62 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX9-NEXT: s_add_i32 s9, s9, s13 ; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 -; GFX9-NEXT: s_xor_b32 s6, s12, s6 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v2 +; GFX9-NEXT: s_xor_b32 s4, s12, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: s_ashr_i32 s6, s14, 31 -; GFX9-NEXT: s_add_i32 s7, s14, s6 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_ashr_i32 s4, s14, 31 +; GFX9-NEXT: s_add_i32 s5, s14, s4 ; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: s_xor_b32 s7, s7, s6 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 ; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 -; GFX9-NEXT: s_sub_i32 s8, 0, s7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: s_sub_i32 s8, 0, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s13, s4 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: s_ashr_i32 s4, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s4 +; GFX9-NEXT: s_xor_b32 s6, s13, s6 +; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 +; GFX9-NEXT: s_ashr_i32 s6, s15, 31 +; GFX9-NEXT: s_add_i32 s9, s15, s6 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_xor_b32 s9, s9, s4 +; GFX9-NEXT: s_xor_b32 s9, s9, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: s_ashr_i32 s7, s10, 31 +; GFX9-NEXT: s_add_i32 s8, s10, s7 +; GFX9-NEXT: s_xor_b32 s8, s8, s7 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s7 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s5 ; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 @@ -1080,27 +1080,27 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_sub_i32 s8, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 -; GFX9-NEXT: s_ashr_i32 s7, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s7 -; GFX9-NEXT: s_xor_b32 s8, s8, s7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s5 +; GFX9-NEXT: s_xor_b32 s8, s8, s5 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s7, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 +; GFX9-NEXT: s_xor_b32 s4, s5, s6 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc @@ -1112,12 +1112,12 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, s5, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, s7, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s5, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] @@ -1125,18 +1125,18 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s0, s12, 31 ; GFX10-NEXT: s_ashr_i32 s1, s13, 31 ; GFX10-NEXT: s_ashr_i32 s2, s14, 31 ; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s6, s12, s0 -; GFX10-NEXT: s_add_i32 s7, s13, s1 +; GFX10-NEXT: s_add_i32 s4, s12, s0 +; GFX10-NEXT: s_add_i32 s5, s13, s1 ; GFX10-NEXT: s_add_i32 s12, s14, s2 ; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s6, s0 -; GFX10-NEXT: s_xor_b32 s15, s7, s1 +; GFX10-NEXT: s_xor_b32 s14, s4, s0 +; GFX10-NEXT: s_xor_b32 s15, s5, s1 ; GFX10-NEXT: s_xor_b32 s12, s12, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX10-NEXT: s_xor_b32 s13, s13, s3 @@ -1144,11 +1144,11 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s14 +; GFX10-NEXT: s_sub_i32 s4, 0, s14 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s7, 0, s15 +; GFX10-NEXT: s_sub_i32 s5, 0, s15 ; GFX10-NEXT: s_sub_i32 s19, 0, s12 ; GFX10-NEXT: s_ashr_i32 s16, s8, 31 ; GFX10-NEXT: s_ashr_i32 s17, s9, 31 @@ -1163,22 +1163,22 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s13 -; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 +; GFX10-NEXT: v_mul_lo_u32 v4, s4, v0 +; GFX10-NEXT: s_sub_i32 s4, 0, s13 +; GFX10-NEXT: v_mul_lo_u32 v5, s5, v1 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX10-NEXT: v_mul_lo_u32 v7, s4, v3 ; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s6, s8, s16 -; GFX10-NEXT: s_add_i32 s7, s9, s17 +; GFX10-NEXT: s_add_i32 s4, s8, s16 +; GFX10-NEXT: s_add_i32 s5, s9, s17 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: s_add_i32 s8, s10, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 ; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s6, s16 -; GFX10-NEXT: s_xor_b32 s11, s7, s17 +; GFX10-NEXT: s_xor_b32 s10, s4, s16 +; GFX10-NEXT: s_xor_b32 s11, s5, s17 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 ; GFX10-NEXT: s_xor_b32 s8, s8, s18 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 @@ -1271,8 +1271,8 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s13, 31 ; GFX8-NEXT: s_ashr_i32 s6, s1, 31 @@ -1582,8 +1582,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s13, 31 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 @@ -1885,8 +1885,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s16, s1, 31 ; GFX10-NEXT: s_ashr_i32 s4, s13, 31 @@ -2187,25 +2187,25 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s6, 0x80008 -; GFX8-NEXT: s_ashr_i32 s7, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s7 -; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX8-NEXT: s_ashr_i32 s5, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_xor_b32 s8, s0, s5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_sext_i32_i8 s4, s6 -; GFX8-NEXT: s_ashr_i32 s5, s4, 31 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_add_i32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s6, s5, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2222,52 +2222,52 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2280,18 +2280,19 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: sdiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s1, s1, s4 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s7, s1, s6 +; GFX10-NEXT: s_xor_b32 s5, s1, s4 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s1, 0, s5 +; GFX10-NEXT: s_xor_b32 s4, s8, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2299,18 +2300,17 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2332,14 +2332,14 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_xor_b32 s8, s0, s3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s6, 0, s8 +; GFX8-NEXT: s_sub_i32 s4, 0, s8 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80018 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2351,10 +2351,10 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s9, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s9 -; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s9 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2420,45 +2420,45 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s6, 0x80010 -; GFX9-NEXT: s_ashr_i32 s7, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s7 -; GFX9-NEXT: s_xor_b32 s8, s0, s7 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80010 +; GFX9-NEXT: s_ashr_i32 s5, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s5 +; GFX9-NEXT: s_xor_b32 s8, s0, s5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_bfe_i32 s5, s6, 0x80018 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_bfe_i32 s7, s4, 0x80018 +; GFX9-NEXT: s_ashr_i32 s9, s7, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s5, s5, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_add_i32 s7, s7, s9 +; GFX9-NEXT: s_xor_b32 s7, s7, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i8 s4, s6 +; GFX9-NEXT: s_sext_i32_i8 s6, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s4, 31 +; GFX9-NEXT: s_ashr_i32 s10, s6, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s10 -; GFX9-NEXT: s_xor_b32 s4, s4, s10 -; GFX9-NEXT: s_sub_i32 s11, 0, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s10 +; GFX9-NEXT: s_xor_b32 s6, s6, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s7 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s6, s6, 0x80008 -; GFX9-NEXT: s_ashr_i32 s11, s6, 31 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x80008 +; GFX9-NEXT: s_ashr_i32 s11, s4, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s11 +; GFX9-NEXT: s_add_i32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s6, s11 +; GFX9-NEXT: s_xor_b32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 @@ -2469,25 +2469,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s6, s10, s7 -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: s_xor_b32 s5, s10, s5 +; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 @@ -2505,7 +2505,7 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 ; GFX10-NEXT: s_bfe_i32 s3, s0, 0x80010 @@ -2517,36 +2517,36 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: s_xor_b32 s3, s3, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: s_sub_i32 s4, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s3 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_bfe_i32 s6, s0, 0x80008 +; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 +; GFX10-NEXT: s_sub_i32 s4, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX10-NEXT: s_bfe_i32 s4, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s9, s6, 31 +; GFX10-NEXT: s_ashr_i32 s9, s4, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s6, s6, s9 +; GFX10-NEXT: s_add_i32 s4, s4, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s6, s6, s9 +; GFX10-NEXT: s_xor_b32 s4, s4, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 @@ -2596,25 +2596,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s6, 0x100010 -; GFX8-NEXT: s_ashr_i32 s7, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s7 -; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 +; GFX8-NEXT: s_ashr_i32 s5, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_xor_b32 s8, s0, s5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_sext_i32_i16 s4, s6 -; GFX8-NEXT: s_ashr_i32 s5, s4, 31 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_add_i32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s6, s5, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,52 +2631,52 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x100010 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2689,18 +2689,19 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: sdiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x100010 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s1, s1, s4 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s7, s1, s6 +; GFX10-NEXT: s_xor_b32 s5, s1, s4 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s1, 0, s5 +; GFX10-NEXT: s_xor_b32 s4, s8, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2708,18 +2709,17 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2741,14 +2741,14 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s8 ; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: s_sub_i32 s6, 0, s9 +; GFX8-NEXT: s_sub_i32 s4, 0, s9 ; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2760,10 +2760,10 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2829,15 +2829,15 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s7 +; GFX9-NEXT: s_sext_i32_i16 s0, s5 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_bfe_i32 s5, s7, 0x100010 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x100010 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_ashr_i32 s7, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s5, s5, s7 @@ -2847,27 +2847,27 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i16 s4, s6 +; GFX9-NEXT: s_sext_i32_i16 s6, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s4, 31 +; GFX9-NEXT: s_ashr_i32 s10, s6, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s10 -; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: s_add_i32 s6, s6, s10 +; GFX9-NEXT: s_xor_b32 s6, s6, s10 ; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s6, s6, 0x100010 -; GFX9-NEXT: s_ashr_i32 s11, s6, 31 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x100010 +; GFX9-NEXT: s_ashr_i32 s11, s4, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s11 +; GFX9-NEXT: s_add_i32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: s_xor_b32 s4, s6, s11 +; GFX9-NEXT: s_xor_b32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s1 ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 @@ -2924,36 +2924,36 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_xor_b32 s1, s1, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s6, 0, s2 +; GFX10-NEXT: s_sub_i32 s4, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_sext_i32_i16 s6, s0 +; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 +; GFX10-NEXT: s_sub_i32 s4, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX10-NEXT: s_sext_i32_i16 s4, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX10-NEXT: s_ashr_i32 s9, s6, 31 +; GFX10-NEXT: s_ashr_i32 s9, s4, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s6, s6, s9 +; GFX10-NEXT: s_add_i32 s4, s4, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s6, s6, s9 +; GFX10-NEXT: s_xor_b32 s4, s4, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 @@ -3002,25 +3002,25 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s6, 0x30008 -; GFX8-NEXT: s_ashr_i32 s7, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s7 -; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX8-NEXT: s_ashr_i32 s5, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_xor_b32 s8, s0, s5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_bfe_i32 s4, s4, 0x30000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_bfe_i32 s4, s6, 0x30000 -; GFX8-NEXT: s_ashr_i32 s5, s4, 31 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_add_i32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s6, s5, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3037,12 +3037,12 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 @@ -3052,39 +3052,39 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: sdivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x30000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3099,18 +3099,19 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: sdivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x30008 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 -; GFX10-NEXT: s_ashr_i32 s7, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 -; GFX10-NEXT: s_add_i32 s0, s0, s7 -; GFX10-NEXT: s_xor_b32 s1, s1, s6 -; GFX10-NEXT: s_xor_b32 s0, s0, s7 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s5, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s0, s0, s5 +; GFX10-NEXT: s_xor_b32 s1, s1, s4 +; GFX10-NEXT: s_xor_b32 s0, s0, s5 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 +; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3128,15 +3129,14 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s7, s6 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 7, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3153,25 +3153,25 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s7, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s7, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s7 -; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s5, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_xor_b32 s8, s0, s5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_bfe_i32 s4, s4, 0x1b0000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_bfe_i32 s4, s6, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s5, s4, 31 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_add_i32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s6, s5, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,12 +3188,12 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 @@ -3203,39 +3203,39 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s1, 0x1b0000 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x1b0000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3250,18 +3250,19 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x1b0000 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x1b0000 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 -; GFX10-NEXT: s_ashr_i32 s7, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 -; GFX10-NEXT: s_add_i32 s0, s0, s7 -; GFX10-NEXT: s_xor_b32 s1, s1, s6 -; GFX10-NEXT: s_xor_b32 s0, s0, s7 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s5, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s0, s0, s5 +; GFX10-NEXT: s_xor_b32 s1, s1, s4 +; GFX10-NEXT: s_xor_b32 s0, s0, s5 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 +; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3279,15 +3280,14 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s7, s6 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 62d8b7d6f045c..0389cacb61390 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -6,32 +6,32 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s6, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s6, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -41,30 +41,30 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s0, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s6, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -73,28 +73,28 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s0, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s0, 0, s5 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -112,7 +112,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -251,7 +251,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -384,7 +384,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -522,7 +522,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -576,7 +576,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 @@ -627,7 +627,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 @@ -685,8 +685,8 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -783,7 +783,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -792,6 +792,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 +; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -799,8 +800,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_sub_i32 s4, 0, s14 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -878,9 +878,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -979,8 +979,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1510,7 +1510,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 @@ -1546,9 +1546,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4] ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 ; GFX10-NEXT: s_subb_u32 s3, 0, s15 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6] ; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2 ; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2 @@ -1560,39 +1560,39 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 -; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v11, s6, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v2, s6, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v6, s6, v11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v5 +; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v16, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_u32 v1, s6, v2, v1 +; GFX10-NEXT: v_add_co_u32 v1, s4, v2, v1 ; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v2, v5, v2, v0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v7, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s0, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, s2, v8, 0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4] @@ -1772,34 +1772,34 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s6, 0x80008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -1809,32 +1809,32 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 -; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0xff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] @@ -1843,12 +1843,12 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: udiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s1, 0, s6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s1, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1856,17 +1856,17 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -1884,8 +1884,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: udivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -1949,55 +1949,55 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80010 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s4 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s7 +; GFX9-NEXT: s_sub_i32 s2, 0, s5 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: s_and_b32 s8, s0, 0xff ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s4 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s5 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2012,7 +2012,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010 @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: s_sub_i32 s4, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -2029,8 +2029,8 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2081,34 +2081,34 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s7, s6, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_lshr_b32 s5, s4, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, 0xffff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -2118,32 +2118,32 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 -; GFX9-NEXT: s_and_b32 s7, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0xffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v2, v0, s[0:1] @@ -2152,12 +2152,12 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: udiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s1, 0, s6 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s1, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2165,17 +2165,17 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_sub_i32 s1, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s2 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2274,10 +2274,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2319,14 +2319,14 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2387,34 +2387,34 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s6, 0x30008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_and_b32 s4, s4, 7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, 7 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -2426,32 +2426,32 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: udivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 -; GFX9-NEXT: s_and_b32 s7, s0, 7 +; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2462,12 +2462,12 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: udivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s6, s0, 0x30008 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x30008 ; GFX10-NEXT: s_and_b32 s0, s0, 7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s1, 0, s6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s1, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2475,17 +2475,17 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2505,34 +2505,34 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s7, s7, 0x7ffffff -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_and_b32 s4, s4, 0x7ffffff ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2544,32 +2544,32 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 -; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff +; GFX9-NEXT: s_and_b32 s4, s1, 0x7ffffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0x7ffffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2580,12 +2580,12 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX10-NEXT: s_and_b32 s4, s1, 0x7ffffff ; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s1, 0, s6 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s1, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2593,17 +2593,17 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 611a7b566070c..3720b9da52dcd 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -33,13 +33,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 @@ -65,10 +65,12 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -93,8 +95,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; VI-LABEL: s_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -114,37 +116,37 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: s_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s0, s1 +; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 @@ -165,7 +167,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; VI-LABEL: s_test_add_self_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -184,7 +186,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: s_test_add_self_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -195,7 +197,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_add_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -206,7 +208,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_add_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -226,7 +228,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; VI-LABEL: s_test_add_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -243,7 +245,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -253,7 +255,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, s6, s7 @@ -262,7 +264,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 @@ -279,7 +281,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -298,7 +300,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_add_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -311,7 +313,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_add_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -323,8 +325,10 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_add_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -346,7 +350,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -365,7 +369,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -378,7 +382,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_add_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -390,8 +394,10 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_add_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -412,7 +418,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -431,7 +437,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_add_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -443,7 +449,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_add_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -455,8 +461,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_add_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -477,7 +485,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -495,7 +503,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -507,7 +515,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +527,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -542,7 +552,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -560,7 +570,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -572,7 +582,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -584,8 +594,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -607,8 +619,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -630,14 +642,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -648,13 +660,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -666,10 +678,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -699,8 +713,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -724,14 +738,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 @@ -743,13 +757,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -762,8 +776,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -795,8 +811,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -820,14 +836,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 @@ -838,13 +854,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -856,10 +872,12 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -889,8 +907,8 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -915,13 +933,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -935,14 +953,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -956,10 +974,12 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 559871d162e13..cf04dc8e59ead 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 @@ -137,7 +137,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -167,7 +167,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: sdiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -280,7 +280,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_abs_i32 s0, s7 @@ -359,7 +359,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -394,7 +394,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_abs_i32 s0, s7 @@ -452,15 +452,15 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: udiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s3, s2, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -468,19 +468,20 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -488,6 +489,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i16 %x, %y @@ -521,36 +523,37 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: urem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s2, s4, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_and_b32 s3, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_lshr_b32 s5, s4, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_and_b32 s0, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s4, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -559,8 +562,8 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -597,8 +600,8 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: sdiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -623,27 +626,27 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_ashr_i32 s2, s4, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 +; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i16 %x, %y store i16 %r, ptr addrspace(1) %out @@ -680,8 +683,8 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: srem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s5, s4, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -708,7 +711,8 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -718,7 +722,6 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s6, s2, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -730,7 +733,6 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i16 %x, %y @@ -762,8 +764,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: udiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -781,13 +783,13 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -827,8 +829,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: urem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 @@ -849,13 +851,13 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s2, s4, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -863,9 +865,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i8 %x, %y @@ -901,8 +902,8 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: sdiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -927,27 +928,27 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i8 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i8 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 +; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -984,8 +985,8 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: srem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -1013,30 +1014,30 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i8 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i8 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s0, 1 +; GFX9-NEXT: s_or_b32 s6, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -1178,13 +1179,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: s_sub_i32 s2, 0, s12 +; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1194,28 +1195,28 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s12 -; GFX6-NEXT: s_sub_i32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s12 -; GFX6-NEXT: s_cmp_ge_u32 s2, s12 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s12 +; GFX6-NEXT: s_sub_i32 s0, s8, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s12 +; GFX6-NEXT: s_cmp_ge_u32 s0, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s12 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s0, s12 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 @@ -1276,9 +1277,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1498,34 +1499,36 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: urem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s8 -; GFX6-NEXT: s_sub_i32 s2, s4, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s8 -; GFX6-NEXT: s_cmp_ge_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s8 -; GFX6-NEXT: s_cmp_ge_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s4, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s8 +; GFX6-NEXT: s_sub_i32 s0, s4, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ -1533,60 +1536,58 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s9 -; GFX6-NEXT: s_sub_i32 s2, s5, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s9 -; GFX6-NEXT: s_cmp_ge_u32 s2, s9 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s9 -; GFX6-NEXT: s_cmp_ge_u32 s2, s9 -; GFX6-NEXT: s_cselect_b32 s5, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s9 +; GFX6-NEXT: s_sub_i32 s1, s5, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s9 +; GFX6-NEXT: s_cmp_ge_u32 s1, s9 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s9 +; GFX6-NEXT: s_cmp_ge_u32 s1, s9 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s10 -; GFX6-NEXT: s_sub_i32 s2, s6, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s10 -; GFX6-NEXT: s_cmp_ge_u32 s2, s10 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s10 -; GFX6-NEXT: s_cmp_ge_u32 s2, s10 -; GFX6-NEXT: s_cselect_b32 s6, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s10 +; GFX6-NEXT: s_sub_i32 s4, s6, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s10 +; GFX6-NEXT: s_cmp_ge_u32 s4, s10 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s10 +; GFX6-NEXT: s_cmp_ge_u32 s4, s10 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_readfirstlane_b32 s4, v2 -; GFX6-NEXT: s_mul_i32 s4, s4, s11 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s11 -; GFX6-NEXT: s_cmp_ge_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s11 -; GFX6-NEXT: s_cmp_ge_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v2 +; GFX6-NEXT: s_mul_i32 s0, s0, s11 +; GFX6-NEXT: s_sub_i32 s0, s7, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s11 +; GFX6-NEXT: s_cmp_ge_u32 s0, s11 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s11 +; GFX6-NEXT: s_cmp_ge_u32 s0, s11 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1842,34 +1843,34 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s2, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_abs_i32 s0, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_xor_b32 s4, s8, s12 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_abs_i32 s3, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: s_abs_i32 s1, s8 ; GFX6-NEXT: s_ashr_i32 s8, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s2 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_mul_i32 s4, s4, s0 +; GFX6-NEXT: s_sub_i32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_abs_i32 s4, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GFX6-NEXT: s_sub_i32 s5, 0, s4 @@ -1877,7 +1878,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s6, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 @@ -1964,17 +1965,16 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s2, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_xor_b32 s3, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_abs_i32 s0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_xor_b32 s1, s4, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s0 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s3, s3, 31 +; GFX9-NEXT: s_ashr_i32 s1, s1, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 @@ -1982,81 +1982,82 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s12, s8, s2 +; GFX9-NEXT: s_mul_i32 s12, s8, s0 ; GFX9-NEXT: s_sub_i32 s4, s4, s12 ; GFX9-NEXT: s_add_i32 s13, s8, 1 -; GFX9-NEXT: s_sub_i32 s12, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_sub_i32 s12, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 ; GFX9-NEXT: s_cselect_b32 s8, s13, s8 ; GFX9-NEXT: s_cselect_b32 s4, s12, s4 ; GFX9-NEXT: s_add_i32 s12, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s12, s8 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_cselect_b32 s0, s12, s8 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_xor_b32 s8, s5, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s12, s0, s1 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: s_ashr_i32 s8, s8, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s3 -; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 -; GFX9-NEXT: s_add_i32 s3, s3, s9 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_mul_i32 s9, s3, s4 -; GFX9-NEXT: s_sub_i32 s5, s5, s9 -; GFX9-NEXT: s_add_i32 s12, s3, 1 -; GFX9-NEXT: s_sub_i32 s9, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s3, s12, s3 -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 -; GFX9-NEXT: s_add_i32 s9, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s3, s9, s3 -; GFX9-NEXT: s_abs_i32 s4, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s8 -; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s0 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s1, s0, s4 +; GFX9-NEXT: s_sub_i32 s1, s5, s1 +; GFX9-NEXT: s_add_i32 s9, s0, 1 +; GFX9-NEXT: s_sub_i32 s5, s1, s4 +; GFX9-NEXT: s_cmp_ge_u32 s1, s4 +; GFX9-NEXT: s_cselect_b32 s0, s9, s0 +; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_add_i32 s5, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s4 +; GFX9-NEXT: s_cselect_b32 s0, s5, s0 +; GFX9-NEXT: s_abs_i32 s1, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s6, s10 +; GFX9-NEXT: s_abs_i32 s5, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s6, s10 -; GFX9-NEXT: s_abs_i32 s6, s6 -; GFX9-NEXT: s_ashr_i32 s5, s5, 31 +; GFX9-NEXT: s_sub_i32 s6, 0, s1 +; GFX9-NEXT: s_sub_i32 s8, s0, s8 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s8 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 -; GFX9-NEXT: s_add_i32 s8, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 -; GFX9-NEXT: s_mul_i32 s9, s8, s4 -; GFX9-NEXT: s_sub_i32 s6, s6, s9 -; GFX9-NEXT: s_add_i32 s10, s8, 1 -; GFX9-NEXT: s_sub_i32 s9, s6, s4 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s8, s10, s8 -; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_add_i32 s9, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s4, s9, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s0 +; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s6, s0, s1 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s9, s0, 1 +; GFX9-NEXT: s_sub_i32 s6, s5, s1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_cselect_b32 s0, s9, s0 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_cselect_b32 s5, s6, s0 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_xor_b32 s2, s7, s11 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2076,6 +2077,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i32> %x, %y @@ -2242,35 +2244,34 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: srem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s2, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_abs_i32 s0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_abs_i32 s3, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: s_abs_i32 s1, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: s_mul_i32 s8, s8, s2 -; GFX6-NEXT: s_sub_i32 s3, s3, s8 -; GFX6-NEXT: s_sub_i32 s8, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s3, s8, s3 -; GFX6-NEXT: s_sub_i32 s8, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s2, s8, s3 -; GFX6-NEXT: s_abs_i32 s3, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s8, 0, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, s4 -; GFX6-NEXT: s_sub_i32 s4, s2, s4 +; GFX6-NEXT: s_mul_i32 s8, s8, s0 +; GFX6-NEXT: s_sub_i32 s1, s1, s8 +; GFX6-NEXT: s_sub_i32 s8, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s8, s1 +; GFX6-NEXT: s_sub_i32 s8, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s0, s8, s1 +; GFX6-NEXT: s_abs_i32 s1, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_sub_i32 s8, 0, s1 +; GFX6-NEXT: s_xor_b32 s0, s0, s4 +; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2280,21 +2281,22 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_abs_i32 s3, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s8, 0, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, s5 -; GFX6-NEXT: s_sub_i32 s5, s2, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s8, s4, s1 +; GFX6-NEXT: s_cmp_ge_u32 s4, s1 +; GFX6-NEXT: s_cselect_b32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s8, s4, s1 +; GFX6-NEXT: s_cmp_ge_u32 s4, s1 +; GFX6-NEXT: s_cselect_b32 s1, s8, s4 +; GFX6-NEXT: s_abs_i32 s4, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX6-NEXT: s_sub_i32 s8, 0, s4 +; GFX6-NEXT: s_xor_b32 s1, s1, s5 +; GFX6-NEXT: s_sub_i32 s1, s1, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 @@ -2303,59 +2305,59 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s8, s8, s2 -; GFX6-NEXT: s_abs_i32 s9, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_sub_i32 s2, 0, s9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_mul_i32 s5, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s8, s5, s4 +; GFX6-NEXT: s_cmp_ge_u32 s5, s4 +; GFX6-NEXT: s_cselect_b32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s8, s5, s4 +; GFX6-NEXT: s_cmp_ge_u32 s5, s4 +; GFX6-NEXT: s_cselect_b32 s4, s8, s5 +; GFX6-NEXT: s_abs_i32 s5, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s8, 0, s5 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_abs_i32 s4, s7 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v2 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_abs_i32 s0, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s8, v2 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_xor_b32 s2, s4, s6 +; GFX6-NEXT: s_sub_i32 s2, s2, s6 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_ashr_i32 s5, s7, 31 -; GFX6-NEXT: s_xor_b32 s7, s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_ashr_i32 s1, s7, 31 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 -; GFX6-NEXT: s_sub_i32 s6, s7, s6 -; GFX6-NEXT: v_readfirstlane_b32 s7, v2 -; GFX6-NEXT: s_mul_i32 s7, s7, s9 -; GFX6-NEXT: s_sub_i32 s4, s4, s7 -; GFX6-NEXT: s_sub_i32 s7, s4, s9 -; GFX6-NEXT: s_cmp_ge_u32 s4, s9 -; GFX6-NEXT: s_cselect_b32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s7, s4, s9 -; GFX6-NEXT: s_cmp_ge_u32 s4, s9 -; GFX6-NEXT: s_cselect_b32 s4, s7, s4 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v2 +; GFX6-NEXT: v_readfirstlane_b32 s3, v2 +; GFX6-NEXT: s_mul_i32 s3, s3, s5 +; GFX6-NEXT: s_sub_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s3, s0, s5 +; GFX6-NEXT: s_cmp_ge_u32 s0, s5 +; GFX6-NEXT: s_cselect_b32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s3, s0, s5 +; GFX6-NEXT: s_cmp_ge_u32 s0, s5 +; GFX6-NEXT: s_cselect_b32 s0, s3, s0 +; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s2, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s8, 0, s2 -; GFX9-NEXT: s_ashr_i32 s3, s4, 31 +; GFX9-NEXT: s_abs_i32 s0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_sub_i32 s8, 0, s0 +; GFX9-NEXT: s_ashr_i32 s1, s4, 31 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2365,72 +2367,73 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s8, s8, s2 +; GFX9-NEXT: s_mul_i32 s8, s8, s0 ; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_sub_i32 s8, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 ; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_sub_i32 s8, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s8, s4 +; GFX9-NEXT: s_sub_i32 s8, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_cselect_b32 s0, s8, s4 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s12, s0, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s3 -; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 -; GFX9-NEXT: s_add_i32 s3, s3, s9 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_mul_i32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s4 -; GFX9-NEXT: s_cmp_ge_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s4 -; GFX9-NEXT: s_cmp_ge_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_abs_i32 s4, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s8 -; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s0 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s4 +; GFX9-NEXT: s_sub_i32 s0, s5, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s4 +; GFX9-NEXT: s_cmp_ge_u32 s0, s4 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s4 +; GFX9-NEXT: s_cmp_ge_u32 s0, s4 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: s_abs_i32 s1, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s4, s6, 31 +; GFX9-NEXT: s_abs_i32 s5, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s5, s6, 31 -; GFX9-NEXT: s_abs_i32 s6, s6 +; GFX9-NEXT: s_sub_i32 s6, 0, s1 +; GFX9-NEXT: s_sub_i32 s8, s0, s8 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s8 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 -; GFX9-NEXT: s_add_i32 s8, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 -; GFX9-NEXT: s_mul_i32 s8, s8, s4 -; GFX9-NEXT: s_sub_i32 s6, s6, s8 -; GFX9-NEXT: s_sub_i32 s8, s6, s4 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_sub_i32 s8, s6, s4 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s4, s8, s6 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s0 +; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s5, s0 +; GFX9-NEXT: s_sub_i32 s5, s0, s1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s5, s0 +; GFX9-NEXT: s_sub_i32 s5, s0, s1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s1 +; GFX9-NEXT: s_cselect_b32 s5, s5, s0 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_ashr_i32 s2, s7, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2448,6 +2451,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i32> %x, %y @@ -2542,8 +2546,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2602,21 +2606,21 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_and_b32 s1, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s0, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -2654,6 +2658,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y @@ -2756,8 +2761,8 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2824,35 +2829,34 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: s_and_b32 s9, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: s_and_b32 s8, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: s_and_b32 s3, s7, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s8, s5, 0xffff +; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -2867,20 +2871,21 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 @@ -2994,8 +2999,8 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3074,79 +3079,79 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_sext_i32_i16 s2, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s8, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: s_ashr_i32 s3, s6, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 -; GFX9-NEXT: s_xor_b32 s0, s4, s1 +; GFX9-NEXT: s_xor_b32 s2, s4, s3 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: s_or_b32 s4, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s1, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 -; GFX9-NEXT: s_sext_i32_i16 s0, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX9-NEXT: s_sext_i32_i16 s3, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: s_ashr_i32 s1, s7, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 -; GFX9-NEXT: s_ashr_i32 s0, s5, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: s_ashr_i32 s3, s7, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 +; GFX9-NEXT: s_ashr_i32 s2, s5, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3264,8 +3269,8 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3356,78 +3361,78 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GFX9-NEXT: s_sext_i32_i16 s9, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX9-NEXT: s_xor_b32 s0, s9, s8 +; GFX9-NEXT: s_xor_b32 s2, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s10, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s10, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s10, 0 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 +; GFX9-NEXT: v_add_u32_e32 v1, s2, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s4, s6 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_xor_b32 s2, s4, s6 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: s_or_b32 s8, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s8, 0 ; GFX9-NEXT: s_sext_i32_i16 s8, s7 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: s_sext_i32_i16 s6, s5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s0, s6, s8 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s10, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s6, s8 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s10, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s10, 0 ; GFX9-NEXT: s_ashr_i32 s7, s7, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s0, s5, s7 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_xor_b32 s2, s5, s7 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_or_b32 s8, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 @@ -3436,7 +3441,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3467,8 +3472,8 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: udiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 @@ -3489,15 +3494,15 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s0, s4, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX9-NEXT: s_and_b32 s2, s4, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -3505,7 +3510,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v2, v0, s[2:3] +; GFX9-NEXT: global_store_byte v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3538,8 +3543,8 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: urem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -3563,24 +3568,24 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s4, s2, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: s_and_b32 s1, s4, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] @@ -3618,8 +3623,8 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: sdiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3645,28 +3650,28 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3703,8 +3708,8 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: srem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -3733,27 +3738,27 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 @@ -3832,8 +3837,8 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3879,21 +3884,21 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_and_b32 s1, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s0, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -3918,6 +3923,7 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -3999,8 +4005,8 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4052,33 +4058,33 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: s_and_b32 s9, s6, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s8, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc @@ -4087,17 +4093,16 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4185,8 +4190,8 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4247,62 +4252,62 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_sext_i32_i16 s2, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s8, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: s_ashr_i32 s3, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s4, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s4, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s1, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 -; GFX9-NEXT: s_sext_i32_i16 s0, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: s_sext_i32_i16 s3, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[2:3] +; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4394,8 +4399,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4464,7 +4469,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 @@ -4516,7 +4522,6 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -4524,7 +4529,6 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4600,33 +4604,31 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: udiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_and_b32 s5, s8, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_and_b32 s4, s6, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f +; GFX6-NEXT: s_and_b32 s2, s10, 0x7fff +; GFX6-NEXT: s_and_b32 s3, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX6-NEXT: s_bfe_u32 s3, s10, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4649,31 +4651,33 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff -; GFX9-NEXT: s_and_b32 s1, s2, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX9-NEXT: s_bfe_u32 s0, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff +; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f +; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -4787,41 +4791,41 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: urem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: s_and_b32 s7, s8, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: s_and_b32 s3, s10, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: s_bfe_u32 s5, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s6, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s10, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -4830,32 +4834,32 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s5, s8, 15 +; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_lshr_b32 s4, s6, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 +; GFX6-NEXT: s_lshr_b32 s2, s10, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 @@ -4996,52 +5000,50 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: s_bfe_i32 s2, s0, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 +; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_xor_b32 s1, s1, s2 +; GFX6-NEXT: s_ashr_i32 s1, s1, 30 +; GFX6-NEXT: s_or_b32 s1, s1, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: s_or_b32 s7, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s4, s7, 0 -; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v4 -; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX6-NEXT: s_cselect_b32 s1, s1, 0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v4 +; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 30 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s2, s0, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_cselect_b32 s0, s2, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5059,43 +5061,46 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_bfe_i32 s2, s0, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 -; GFX9-NEXT: s_or_b32 s3, s0, 1 +; GFX9-NEXT: s_xor_b32 s1, s1, s2 +; GFX9-NEXT: s_ashr_i32 s1, s1, 30 +; GFX9-NEXT: s_or_b32 s1, s1, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s0, s3, 0 -; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 -; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GFX9-NEXT: v_add_u32_e32 v4, s1, v5 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -5105,7 +5110,6 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s2, 0 @@ -5223,73 +5227,73 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: srem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 +; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX6-NEXT: s_xor_b32 s1, s2, s1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 30 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_mov_b32 s5, s9 +; GFX6-NEXT: s_lshr_b32 s8, s10, 15 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: s_lshr_b32 s7, s6, 15 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 -; GFX6-NEXT: s_lshr_b32 s9, s8, 15 -; GFX6-NEXT: s_or_b32 s10, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s10, 0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v6 -; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 -; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, s8 +; GFX6-NEXT: s_lshr_b32 s9, s0, 15 +; GFX6-NEXT: s_or_b32 s1, s1, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: s_cselect_b32 s1, s1, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v6 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s2, s0, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_cselect_b32 s0, s2, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, s0, v7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 @@ -5297,54 +5301,54 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s6, 0xf0000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 +; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: s_xor_b32 s1, s2, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s1, s1, 30 ; GFX9-NEXT: s_lshr_b32 s8, s6, 15 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 -; GFX9-NEXT: s_lshr_b32 s3, s2, 15 -; GFX9-NEXT: s_or_b32 s7, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s7, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_lshr_b32 s7, s0, 15 +; GFX9-NEXT: s_or_b32 s1, s1, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s1, v6 +; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 @@ -5363,7 +5367,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -5393,8 +5397,8 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -5409,17 +5413,17 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s1, s4, s0 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s1, 20 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s3, s4, s2 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, 20 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5434,8 +5438,8 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5446,13 +5450,13 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5468,7 +5472,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5482,7 +5486,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s0, s7, 12 @@ -5509,7 +5513,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5524,7 +5528,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s6, 12 @@ -5551,7 +5555,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -5570,7 +5574,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_hi_u32 s1, s7, 0x100101 @@ -5660,42 +5664,42 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_mul_i32 s3, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 @@ -5716,54 +5720,54 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 -; GFX9-NEXT: s_mul_i32 s7, s6, s3 -; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; GFX9-NEXT: s_add_i32 s9, s6, 1 -; GFX9-NEXT: s_sub_i32 s7, s4, s3 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 +; GFX9-NEXT: s_mul_i32 s3, s2, s6 +; GFX9-NEXT: s_sub_i32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s9, s2, 1 +; GFX9-NEXT: s_sub_i32 s4, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s2, s9, s2 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s4, s2, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s3, s7, s6 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 -; GFX9-NEXT: s_mul_i32 s6, s4, s2 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s7, s4, 1 -; GFX9-NEXT: s_sub_i32 s6, s5, s2 -; GFX9-NEXT: s_cmp_ge_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s3, 0, s7 +; GFX9-NEXT: s_mul_i32 s3, s3, s8 +; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 +; GFX9-NEXT: s_mul_i32 s4, s3, s7 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s6, s3, 1 +; GFX9-NEXT: s_sub_i32 s5, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -5780,10 +5784,10 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -5799,19 +5803,19 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s1, s4, s0 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s1, 20 -; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s3, s4, s2 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, 20 +; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5826,8 +5830,8 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5838,13 +5842,13 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5860,7 +5864,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5875,7 +5879,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 @@ -5903,7 +5907,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5918,7 +5922,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s6, 0xfff @@ -6000,35 +6004,35 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_mul_i32 s3, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s3, s4, s3 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s4, s4, s3 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s4, s4, s1 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 @@ -6045,55 +6049,56 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cselect_b32 s5, s7, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s3 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_sub_i32 s6, s4, s3 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_sub_i32 s6, s4, s3 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 +; GFX9-NEXT: s_mul_i32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s3, s6, s4 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 -; GFX9-NEXT: s_mul_i32 s4, s4, s2 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, 0, s7 +; GFX9-NEXT: s_mul_i32 s3, s3, s8 +; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 +; GFX9-NEXT: s_mul_i32 s3, s3, s7 +; GFX9-NEXT: s_sub_i32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s7 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s7 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6110,8 +6115,8 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -6126,17 +6131,17 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s1, s0, 31 -; GFX9-NEXT: s_ashr_i32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s2, 31 +; GFX9-NEXT: s_ashr_i32 s2, s2, 20 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6151,8 +6156,8 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6166,16 +6171,16 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s4, s4, s0 -; GFX9-NEXT: s_ashr_i32 s0, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_ashr_i32 s2, s4, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_i32 s4, s4, s2 +; GFX9-NEXT: s_ashr_i32 s2, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6191,7 +6196,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6234,7 +6239,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 @@ -6289,7 +6294,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6310,7 +6315,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s6, 31 @@ -6343,7 +6348,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6365,7 +6370,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s6, 31 @@ -6476,50 +6481,50 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s3, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s6, 0, s3 -; GFX6-NEXT: s_xor_b32 s2, s4, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s1, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_sub_i32 s6, 0, s1 +; GFX6-NEXT: s_xor_b32 s0, s4, s0 ; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX6-NEXT: s_abs_i32 s6, s4 -; GFX6-NEXT: s_ashr_i32 s4, s2, 31 +; GFX6-NEXT: s_ashr_i32 s4, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s6, s2 -; GFX6-NEXT: s_sub_i32 s6, s2, s3 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s6, s0, s1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s2, s6, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_abs_i32 s6, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_xor_b32 s7, s5, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: s_xor_b32 s7, s5, s7 ; GFX6-NEXT: s_abs_i32 s5, s5 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_ashr_i32 s7, s7, 31 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 @@ -6539,71 +6544,73 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_xor_b32 s0, s4, s0 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX9-NEXT: s_abs_i32 s7, s4 -; GFX9-NEXT: s_xor_b32 s2, s4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: s_ashr_i32 s4, s0, 31 +; GFX9-NEXT: s_sub_i32 s0, 0, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8 -; GFX9-NEXT: s_mul_i32 s8, s4, s3 +; GFX9-NEXT: s_mul_i32 s0, s0, s8 +; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 +; GFX9-NEXT: s_add_i32 s8, s8, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s7, s8 +; GFX9-NEXT: s_mul_i32 s8, s0, s1 ; GFX9-NEXT: s_sub_i32 s7, s7, s8 -; GFX9-NEXT: s_add_i32 s9, s4, 1 -; GFX9-NEXT: s_sub_i32 s8, s7, s3 -; GFX9-NEXT: s_cmp_ge_u32 s7, s3 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_add_i32 s9, s0, 1 +; GFX9-NEXT: s_sub_i32 s8, s7, s1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_cselect_b32 s0, s9, s0 ; GFX9-NEXT: s_cselect_b32 s7, s8, s7 -; GFX9-NEXT: s_add_i32 s8, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s3 -; GFX9-NEXT: s_cselect_b32 s3, s8, s4 -; GFX9-NEXT: s_abs_i32 s4, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s7, 0, s4 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s8, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_cselect_b32 s7, s8, s0 +; GFX9-NEXT: s_abs_i32 s8, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_xor_b32 s2, s5, s6 +; GFX9-NEXT: s_abs_i32 s3, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 -; GFX9-NEXT: s_abs_i32 s5, s5 -; GFX9-NEXT: s_ashr_i32 s6, s6, 31 +; GFX9-NEXT: s_xor_b32 s5, s7, s4 +; GFX9-NEXT: s_sub_i32 s6, 0, s8 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s7, s7, s3 -; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7 -; GFX9-NEXT: s_add_i32 s3, s3, s7 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_mul_i32 s7, s3, s4 -; GFX9-NEXT: s_sub_i32 s5, s5, s7 -; GFX9-NEXT: s_add_i32 s8, s3, 1 -; GFX9-NEXT: s_sub_i32 s7, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_add_i32 s7, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_add_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s6, s5, s8 ; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_sub_i32 s6, s3, s8 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s3, s6, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6620,9 +6627,9 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6640,19 +6647,19 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s1, s0, 31 -; GFX9-NEXT: s_ashr_i32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s2, 31 +; GFX9-NEXT: s_ashr_i32 s2, s2, 20 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6667,8 +6674,8 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6683,17 +6690,17 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s0, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_ashr_i32 s2, s4, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_i32 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6709,7 +6716,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -6746,7 +6753,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 @@ -6798,7 +6805,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6821,7 +6828,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s6, 31 @@ -6927,44 +6934,44 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s2, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_abs_i32 s3, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: s_abs_i32 s1, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s2 -; GFX6-NEXT: s_sub_i32 s3, s3, s7 -; GFX6-NEXT: s_sub_i32 s7, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s3, s7, s3 -; GFX6-NEXT: s_sub_i32 s7, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s7, s7, s3 +; GFX6-NEXT: s_mul_i32 s7, s7, s0 +; GFX6-NEXT: s_sub_i32 s1, s1, s7 +; GFX6-NEXT: s_sub_i32 s7, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s7, s1 +; GFX6-NEXT: s_sub_i32 s7, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s7, s7, s1 ; GFX6-NEXT: s_abs_i32 s6, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 ; GFX6-NEXT: s_abs_i32 s8, s5 ; GFX6-NEXT: s_xor_b32 s7, s7, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_sub_i32 s4, s7, s4 ; GFX6-NEXT: s_ashr_i32 s5, s5, 31 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -6982,20 +6989,20 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s5, s6, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s2 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s0, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_lshl_b32 s1, 0x1000, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s0 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s4, s4 @@ -7006,41 +7013,43 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s2 +; GFX9-NEXT: s_mul_i32 s7, s7, s0 ; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s7, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_sub_i32 s7, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_sub_i32 s7, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s7, s4 -; GFX9-NEXT: s_abs_i32 s3, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_xor_b32 s2, s2, s6 -; GFX9-NEXT: s_sub_i32 s7, 0, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s7, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_abs_i32 s7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s5, 31 -; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: s_abs_i32 s3, s5 +; GFX9-NEXT: s_sub_i32 s5, 0, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX9-NEXT: s_mul_i32 s6, s6, s3 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s6, s5, s3 -; GFX9-NEXT: s_cmp_ge_u32 s5, s3 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s3 -; GFX9-NEXT: s_cmp_ge_u32 s5, s3 -; GFX9-NEXT: s_cselect_b32 s3, s6, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s7 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 +; GFX9-NEXT: s_sub_i32 s5, s3, s7 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s7 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -7069,7 +7078,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 ; GFX6-NEXT: s_mul_i32 s6, s5, 0x68958c89 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -7154,11 +7163,11 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0f6 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7267,7 +7276,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: udiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7281,7 +7290,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 @@ -7303,8 +7312,8 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: udiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7319,12 +7328,12 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s2, 12 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 +; GFX9-NEXT: s_add_i32 s0, s0, 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -7348,8 +7357,8 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: udiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7364,17 +7373,17 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7394,8 +7403,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7481,13 +7490,13 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -7570,9 +7579,9 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7595,27 +7604,27 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s8, s8, 12 -; GFX6-NEXT: s_add_i32 s9, s10, 12 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_add_i32 s0, s8, 12 +; GFX6-NEXT: s_add_i32 s2, s10, 12 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s8, 12 ; GFX9-NEXT: s_add_i32 s8, s10, 12 @@ -7641,12 +7650,12 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_add_u32 s0, 4, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_addc_u32 s1, 0, 0 ; GFX6-NEXT: s_or_b32 s0, vcc_lo, vcc_hi +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_mov_b32 s0, 0x689e0837 ; GFX6-NEXT: s_movk_i32 s2, 0xfee0 @@ -7737,11 +7746,11 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7848,7 +7857,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 @@ -7862,7 +7871,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff @@ -7883,8 +7892,8 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: urem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7901,11 +7910,11 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX9-NEXT: s_add_u32 s0, s0, -1 ; GFX9-NEXT: s_addc_u32 s1, s1, -1 ; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] @@ -7932,8 +7941,8 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: urem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -7948,16 +7957,16 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0xfff -; GFX9-NEXT: s_and_b32 s1, s6, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0xfff +; GFX9-NEXT: s_and_b32 s3, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7980,31 +7989,31 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX6-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 -; GFX6-NEXT: s_add_u32 s8, s8, -1 -; GFX6-NEXT: s_addc_u32 s9, s9, -1 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: s_add_u32 s8, s10, -1 -; GFX6-NEXT: s_addc_u32 s9, s11, -1 -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_add_u32 s2, s2, -1 +; GFX6-NEXT: s_addc_u32 s3, s3, -1 +; GFX6-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_add_u32 s0, s0, -1 +; GFX6-NEXT: s_addc_u32 s1, s1, -1 +; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10 ; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 @@ -8034,7 +8043,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s1, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -8150,7 +8159,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -8234,7 +8243,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8252,7 +8261,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -8278,21 +8287,21 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s8 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX6-NEXT: s_ashr_i32 s8, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s3, s3, s8 -; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] +; GFX6-NEXT: s_addc_u32 s1, s1, s8 +; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX6-NEXT: s_sub_u32 s4, 0, s10 ; GFX6-NEXT: s_subb_u32 s5, 0, s11 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8414,19 +8423,19 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s2 -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s4, s4, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s5, s5, s2 -; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_sub_u32 s0, 0, s8 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX9-NEXT: s_ashr_i32 s8, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s1, s8 +; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX9-NEXT: s_sub_u32 s0, 0, s10 +; GFX9-NEXT: s_subb_u32 s1, 0, s11 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8436,61 +8445,60 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-NEXT: v_readfirstlane_b32 s11, v1 -; GFX9-NEXT: s_mul_i32 s12, s0, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 -; GFX9-NEXT: s_mul_i32 s13, s1, s11 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: s_mul_i32 s12, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s3 +; GFX9-NEXT: s_mul_i32 s13, s1, s3 ; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s0, s11 +; GFX9-NEXT: s_mul_i32 s15, s0, s3 ; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 -; GFX9-NEXT: s_mul_i32 s11, s11, s12 -; GFX9-NEXT: s_add_u32 s11, s14, s11 +; GFX9-NEXT: s_mul_hi_u32 s14, s3, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s12 +; GFX9-NEXT: s_mul_i32 s3, s3, s12 +; GFX9-NEXT: s_add_u32 s3, s14, s3 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 -; GFX9-NEXT: s_add_u32 s11, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 -; GFX9-NEXT: s_addc_u32 s11, s13, s16 +; GFX9-NEXT: s_mul_hi_u32 s16, s2, s15 +; GFX9-NEXT: s_mul_i32 s15, s2, s15 +; GFX9-NEXT: s_add_u32 s3, s3, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s2, s12 +; GFX9-NEXT: s_addc_u32 s3, s13, s16 ; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_mul_i32 s12, s2, s12 +; GFX9-NEXT: s_add_u32 s3, s3, s12 ; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s11, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s10, s10, s12 +; GFX9-NEXT: s_addc_u32 s2, s2, s12 ; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: s_mul_i32 s11, s0, s10 +; GFX9-NEXT: s_mul_i32 s3, s0, s2 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s11, s13, s11 +; GFX9-NEXT: s_add_i32 s3, s13, s3 ; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s11, s11, s1 +; GFX9-NEXT: s_add_i32 s3, s3, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 -; GFX9-NEXT: s_mul_i32 s14, s10, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s2, s0 +; GFX9-NEXT: s_mul_i32 s14, s2, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s3 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_addc_u32 s12, 0, s15 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 ; GFX9-NEXT: s_addc_u32 s0, s12, s13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s11, s10, s11 -; GFX9-NEXT: s_add_u32 s0, s0, s11 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s3 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s10, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: s_addc_u32 s12, s2, s1 +; GFX9-NEXT: s_ashr_i32 s2, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s2 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s13, v1 ; GFX9-NEXT: s_mul_i32 s1, s6, s12 ; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 @@ -8506,24 +8514,24 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s12, s7, s12 ; GFX9-NEXT: s_add_u32 s12, s0, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 +; GFX9-NEXT: s_mul_i32 s0, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s10, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s9, s12 +; GFX9-NEXT: s_mul_i32 s1, s11, s12 ; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s12 +; GFX9-NEXT: s_mul_i32 s1, s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s14 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s0, s9 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 +; GFX9-NEXT: s_subb_u32 s6, s0, s11 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s10, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s6, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s6, s9 +; GFX9-NEXT: s_cmp_ge_u32 s6, s11 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 -; GFX9-NEXT: s_cmp_eq_u32 s6, s9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 +; GFX9-NEXT: s_cmp_eq_u32 s6, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -8541,10 +8549,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_cmp_ge_u32 s0, s11 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s9 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -8554,7 +8562,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 @@ -8581,8 +8589,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: sdiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8605,25 +8613,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s5, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 ; GFX9-NEXT: s_addc_u32 s5, s7, 0 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8643,8 +8651,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8744,17 +8752,17 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s5, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, 0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -8838,11 +8846,11 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_u32 s5, s6, s4 ; GFX9-NEXT: s_subb_u32 s4, s7, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8865,37 +8873,36 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 -; GFX6-NEXT: s_lshl_b64 s[12:13], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s14, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s14 -; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_addc_u32 s3, s3, s14 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[14:15] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s10, 0, s2 -; GFX6-NEXT: s_subb_u32 s11, 0, s3 -; GFX6-NEXT: s_ashr_i32 s16, s5, 31 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s10 +; GFX6-NEXT: s_ashr_i32 s16, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s16 +; GFX6-NEXT: s_mov_b32 s17, s16 +; GFX6-NEXT: s_addc_u32 s1, s1, s16 +; GFX6-NEXT: s_xor_b64 s[12:13], s[0:1], s[16:17] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX6-NEXT: s_sub_u32 s0, 0, s12 +; GFX6-NEXT: s_subb_u32 s1, 0, s13 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_add_u32 s0, s4, s16 -; GFX6-NEXT: s_mov_b32 s17, s16 +; GFX6-NEXT: s_ashr_i32 s2, s5, 31 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_addc_u32 s1, s5, s16 -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -8914,12 +8921,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -8935,8 +8941,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: s_add_u32 s0, s4, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_addc_u32 s1, s5, s2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -8946,29 +8955,28 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -8977,23 +8985,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] -; GFX6-NEXT: s_ashr_i32 s4, s13, 31 -; GFX6-NEXT: s_add_u32 s12, s12, s4 +; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[16:17] +; GFX6-NEXT: s_ashr_i32 s2, s15, 31 +; GFX6-NEXT: s_add_u32 s4, s14, s2 ; GFX6-NEXT: v_mov_b32_e32 v6, s5 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s13, s13, s4 -; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s5, s15, s2 +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -9002,16 +9010,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s2, 0, s12 +; GFX6-NEXT: s_sub_u32 s12, 0, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX6-NEXT: s_subb_u32 s3, 0, s13 -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 +; GFX6-NEXT: s_subb_u32 s13, 0, s5 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -9030,11 +9038,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9049,14 +9057,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s2, s7, 31 +; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s2 +; GFX6-NEXT: s_add_u32 s6, s6, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s7, s7, s2 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s7, s7, s12 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9072,25 +9080,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s4, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s5, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NEXT: v_mov_b32_e32 v7, s5 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s4, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -9101,15 +9109,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -9122,19 +9130,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_ashr_i32 s8, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s3, s3, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] +; GFX9-NEXT: s_addc_u32 s1, s1, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 @@ -9408,7 +9416,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y @@ -9425,7 +9432,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s0, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -9445,6 +9451,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, s1, v2 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v3, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc @@ -9539,7 +9546,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -9626,7 +9633,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9646,7 +9653,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -9674,21 +9681,21 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX6-NEXT: s_ashr_i32 s4, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s3, s3, s4 -; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] +; GFX6-NEXT: s_addc_u32 s1, s1, s4 +; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_sub_u32 s4, 0, s8 ; GFX6-NEXT: s_subb_u32 s5, 0, s9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9808,17 +9815,17 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s3, s3, s4 -; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] +; GFX9-NEXT: s_addc_u32 s1, s1, s4 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s8 ; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 @@ -9972,8 +9979,8 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: srem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10000,17 +10007,17 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s5, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s0, s4, s0 -; GFX9-NEXT: s_subb_u32 s1, s5, s1 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s2, s4, s2 +; GFX9-NEXT: s_subb_u32 s3, s5, s3 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 @@ -10018,11 +10025,11 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-NEXT: s_sub_u32 s4, s6, s4 ; GFX9-NEXT: s_subb_u32 s5, s7, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -10045,39 +10052,36 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 ; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s8 +; GFX6-NEXT: s_ashr_i32 s8, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s3, s3, s8 -; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] +; GFX6-NEXT: s_addc_u32 s1, s1, s8 +; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX6-NEXT: s_sub_u32 s2, 0, s14 -; GFX6-NEXT: s_subb_u32 s3, 0, s15 +; GFX6-NEXT: s_sub_u32 s0, 0, s14 +; GFX6-NEXT: s_subb_u32 s1, 0, s15 ; GFX6-NEXT: s_ashr_i32 s12, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_addc_u32 s1, s5, s12 -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -10096,11 +10100,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -10116,8 +10120,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_addc_u32 s1, s5, s12 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -10298,19 +10305,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_ashr_i32 s8, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s3, s3, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] +; GFX9-NEXT: s_addc_u32 s1, s1, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 2c69ae58f0e61..417d38990505b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -33,7 +33,7 @@ entry: define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -42,7 +42,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -58,8 +58,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 @@ -72,8 +72,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 @@ -92,7 +92,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %a ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -114,7 +114,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 @@ -124,7 +124,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 @@ -140,11 +140,11 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -155,8 +155,8 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr ; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN @@ -175,7 +175,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -185,7 +185,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -201,7 +201,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -211,7 +211,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -227,7 +227,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { ; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 @@ -238,7 +238,7 @@ define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ; ; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index af4116bd6aae5..f54ea615ca664 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -115,7 +115,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -221,7 +221,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -313,7 +313,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -329,7 +329,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -373,7 +373,7 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -417,8 +417,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ; ; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -444,8 +444,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -462,8 +462,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou ; ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -491,8 +491,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -509,8 +509,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ; ; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index f8bd44b7c98f5..2e64db12ef564 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 { ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfm_b32 s2, s2, s3 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; ; VI-LABEL: s_bfm_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfm_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -36,11 +36,11 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #0 { ; SI-LABEL: s_bfm_pattern_simple: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s4, s2, 0 +; SI-NEXT: s_bfm_b32 s4, s4, 0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -48,10 +48,10 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) # ; ; VI-LABEL: s_bfm_pattern_simple: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfm_b32 s2, s2, 0 +; VI-NEXT: s_bfm_b32 s2, s4, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 64555f14a55cc..6f52da2631b8a 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -21,8 +21,8 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 { ; SI-LABEL: s_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; GISEL-LABEL: s_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s2, 0xffff +; GISEL-NEXT: s_and_b32 s2, s4, 0xffff ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -62,10 +62,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-FLAT-LABEL: s_brev_i16: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] @@ -76,11 +76,11 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-LABEL: s_brev_i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0xffff ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16 @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_mov_b32 s10, s6 @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i16: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0 @@ -168,7 +168,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] @@ -187,8 +187,8 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 { ; SI-LABEL: s_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -199,8 +199,8 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -211,10 +211,10 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; GISEL-LABEL: s_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b32 s2, s2 +; GISEL-NEXT: s_brev_b32 s2, s4 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -224,11 +224,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-FLAT-LABEL: s_brev_i32: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 @@ -240,11 +240,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-GISEL-LABEL: s_brev_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 +; GFX11-GISEL-NEXT: s_brev_b32 s2, s4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -259,7 +259,7 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -278,7 +278,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -294,7 +294,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -311,7 +311,9 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[2:3] @@ -326,8 +328,10 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -347,7 +351,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -362,7 +366,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -377,7 +381,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 @@ -390,7 +394,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -407,7 +411,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-GISEL-LABEL: s_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 @@ -426,7 +430,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -446,7 +450,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -463,7 +467,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -481,7 +485,9 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -497,9 +503,11 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -520,7 +528,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -534,7 +542,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -548,7 +556,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -560,7 +568,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-FLAT-LABEL: s_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3] ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 @@ -573,7 +581,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-GISEL-LABEL: s_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] @@ -591,7 +599,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -611,7 +619,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -628,7 +636,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -646,7 +654,9 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -662,7 +672,9 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -685,8 +697,8 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 { ; SI-LABEL: s_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -701,8 +713,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -717,8 +729,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; GISEL-LABEL: s_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] ; GISEL-NEXT: s_brev_b64 s[2:3], s[6:7] @@ -734,8 +746,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-FLAT-LABEL: s_brev_v2i64: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5] ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7] @@ -751,8 +763,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-GISEL-LABEL: s_brev_v2i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] @@ -771,7 +783,7 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -793,7 +805,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -812,7 +824,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -832,7 +844,9 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[2:3] @@ -850,7 +864,9 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index e914635d6c26f..13c4ff8b2ff30 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector2: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -19,7 +19,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector2: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -40,7 +40,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector2: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -52,7 +52,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector2: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -67,7 +67,7 @@ entry: define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector4: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -80,7 +80,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: v_mov_b32_e32 v2, 7 @@ -93,7 +93,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector4: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector4: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector4: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -136,7 +136,7 @@ entry: define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector_v2i16: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005 @@ -146,7 +146,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector_v2i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -156,7 +156,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -165,7 +165,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector_v2i16: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -190,8 +190,8 @@ entry: define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: build_vector_v2i16_trunc: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -201,10 +201,10 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX8-LABEL: build_vector_v2i16_trunc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 +; GFX8-NEXT: s_lshr_b32 s2, s4, 16 ; GFX8-NEXT: s_or_b32 s2, s2, 0x50000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -215,11 +215,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX10-LABEL: build_vector_v2i16_trunc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-NEXT: s_lshr_b32 s2, s4, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -228,11 +228,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX11-LABEL: build_vector_v2i16_trunc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5 +; GFX11-NEXT: s_pack_hl_b32_b16 s2, s4, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,14 +242,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX940-LABEL: build_vector_v2i16_trunc: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 5 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_lshr_b32 s2, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, 5 +; GFX940-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %srl = lshr i32 %a, 16 %trunc = trunc i32 %srl to i16 @@ -262,7 +262,7 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) { ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -277,7 +277,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 @@ -302,7 +302,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 @@ -316,7 +316,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s0, s7, 16 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 15ebdd70ae881..231d3d97c8f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -8,7 +8,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; SI-LABEL: kernel: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -18,7 +18,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; VI-LABEL: kernel: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -28,7 +28,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; GFX11-LABEL: kernel: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -115,21 +115,32 @@ define amdgpu_kernel void @call_coldcc() #0 { ; SI-LABEL: call_coldcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0xe8f000 -; SI-NEXT: s_add_u32 s8, s8, s1 -; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_getpc_b64 s[0:1] -; SI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_mov_b32 s23, 0xe8f000 +; SI-NEXT: s_add_u32 s20, s20, s9 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_mov_b32 s14, s8 +; SI-NEXT: s_mov_b64 s[10:11], s[4:5] +; SI-NEXT: s_add_u32 s8, s2, 36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SI-NEXT: s_addc_u32 s9, s3, 0 +; SI-NEXT: s_getpc_b64 s[2:3] +; SI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[0:1], s[8:9] -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b64 s[0:1], s[20:21] +; SI-NEXT: s_mov_b64 s[2:3], s[22:23] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -141,31 +152,49 @@ define amdgpu_kernel void @call_coldcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s1 +; VI-NEXT: s_add_u32 s88, s88, s9 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_getpc_b64 s[0:1] -; VI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s14, s8 +; VI-NEXT: s_add_u32 s8, s2, 36 +; VI-NEXT: s_addc_u32 s9, s3, 0 +; VI-NEXT: s_getpc_b64 s[2:3] +; VI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; VI-NEXT: s_mov_b64 s[10:11], s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_mov_b64 s[4:5], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] +; VI-NEXT: v_or_b32_e32 v31, v0, v2 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_coldcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_add_u32 s8, s2, 36 +; GFX11-NEXT: s_addc_u32 s9, s3, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s12, s13 +; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s13, s14 +; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) @@ -177,21 +206,32 @@ define amdgpu_kernel void @call_fastcc() #0 { ; SI-LABEL: call_fastcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0xe8f000 -; SI-NEXT: s_add_u32 s8, s8, s1 -; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_getpc_b64 s[0:1] -; SI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_mov_b32 s23, 0xe8f000 +; SI-NEXT: s_add_u32 s20, s20, s9 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_mov_b32 s14, s8 +; SI-NEXT: s_mov_b64 s[10:11], s[4:5] +; SI-NEXT: s_add_u32 s8, s2, 36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SI-NEXT: s_addc_u32 s9, s3, 0 +; SI-NEXT: s_getpc_b64 s[2:3] +; SI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[0:1], s[8:9] -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b64 s[0:1], s[20:21] +; SI-NEXT: s_mov_b64 s[2:3], s[22:23] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -203,31 +243,49 @@ define amdgpu_kernel void @call_fastcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s1 +; VI-NEXT: s_add_u32 s88, s88, s9 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_getpc_b64 s[0:1] -; VI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s14, s8 +; VI-NEXT: s_add_u32 s8, s2, 36 +; VI-NEXT: s_addc_u32 s9, s3, 0 +; VI-NEXT: s_getpc_b64 s[2:3] +; VI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; VI-NEXT: s_mov_b64 s[10:11], s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_mov_b64 s[4:5], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] +; VI-NEXT: v_or_b32_e32 v31, v0, v2 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_fastcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_add_u32 s8, s2, 36 +; GFX11-NEXT: s_addc_u32 s9, s3, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s12, s13 +; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s13, s14 +; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) @@ -954,7 +1012,7 @@ define amdgpu_ps i16 @ret_ps_mesa_i16() { define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; SI-LABEL: amd_kernel_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s0, s0, s0 @@ -965,7 +1023,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; VI-LABEL: amd_kernel_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -974,7 +1032,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; GFX11-LABEL: amd_kernel_i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -992,7 +1050,7 @@ entry: define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; SI-LABEL: amd_kernel_v2i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[0:1], 0x9 +; SI-NEXT: s_load_dword s1, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1010,7 +1068,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v2i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_i32 s0, s0, s0 @@ -1024,7 +1082,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v2i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 @@ -1049,7 +1107,7 @@ entry: define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; SI-LABEL: amd_kernel_v4i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[0:1], 0x9 +; SI-NEXT: s_load_dword s1, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1077,7 +1135,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v4i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 24 ; VI-NEXT: s_lshr_b32 s2, s0, 16 @@ -1099,7 +1157,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v4i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 @@ -1136,7 +1194,7 @@ entry: define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; SI-LABEL: amd_kernel_v3i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 2 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1160,7 +1218,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v3i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1180,7 +1238,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v3i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1212,7 +1270,7 @@ entry: define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; SI-LABEL: amd_kernel_v5i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 4 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1245,7 +1303,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v5i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_lshr_b32 s3, s0, 16 @@ -1273,7 +1331,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v5i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 @@ -1312,7 +1370,7 @@ entry: define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; SI-LABEL: amd_kernel_v8i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1357,7 +1415,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v8i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s1, 24 ; VI-NEXT: s_lshr_b32 s3, s1, 16 @@ -1392,7 +1450,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v8i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1 @@ -1445,7 +1503,7 @@ entry: define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; SI-LABEL: amd_kernel_v16i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1524,7 +1582,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v16i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 24 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -1585,7 +1643,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v16i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s10, s3, 16 ; GFX11-NEXT: s_lshr_b32 s11, s3, 24 @@ -1666,7 +1724,7 @@ entry: define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; SI-LABEL: amd_kernel_v32i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s9, 0 ; SI-NEXT: s_mov_b32 s8, 16 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1816,7 +1874,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v32i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v10, 0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1932,7 +1990,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v32i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2 ; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index d511bb1f4a257..bada3d904fbe3 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -30,7 +30,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -49,7 +49,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s4, 8 ; GFX10-NEXT: s_addc_u32 s1, s5, 0 @@ -96,7 +96,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 @@ -155,7 +155,7 @@ bb: define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -175,7 +175,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s4, 8 ; GFX10-NEXT: s_addc_u32 s1, s5, 0 @@ -223,7 +223,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 9c7fa1537c0c2..b5e0589cf9e46 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -88,7 +88,7 @@ bb: define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: sub1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: sub1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -127,33 +127,33 @@ bb: define amdgpu_kernel void @add_adde(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: add_adde: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v4, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: add_adde: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v3, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -171,33 +171,33 @@ bb: define amdgpu_kernel void @adde_add(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: adde_add: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: adde_add: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -215,33 +215,33 @@ bb: define amdgpu_kernel void @sub_sube(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -259,35 +259,35 @@ bb: define amdgpu_kernel void @sub_sube_commuted(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube_commuted: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube_commuted: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x64, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -306,33 +306,33 @@ bb: define amdgpu_kernel void @sube_sub(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sube_sub: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sube_sub: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -350,33 +350,33 @@ bb: define amdgpu_kernel void @zext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: zext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: zext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -392,33 +392,33 @@ bb: define amdgpu_kernel void @sext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: sext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -434,7 +434,7 @@ bb: define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add_and: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_max_u32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 @@ -478,7 +478,7 @@ bb: define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_sext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -493,7 +493,7 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_sext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -518,7 +518,7 @@ bb: define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_zext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_zext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,33 +557,33 @@ bb: define amdgpu_kernel void @sub_addcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_addcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_addcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -601,33 +601,33 @@ bb: define amdgpu_kernel void @sub_subcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_subcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_subcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -646,7 +646,7 @@ bb: define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -665,7 +665,7 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -695,7 +695,7 @@ bb: define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -714,7 +714,7 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 3145ee1f6141e..ba0a1e75e29b7 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -23,11 +23,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s2, s2 +; SI-NEXT: s_flbit_i32_b32 s2, s4 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -36,8 +36,8 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -62,36 +62,36 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_ctlz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s0, s4 -; GFX10-NEXT: s_min_u32 s0, s0, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_flbit_i32_b32 s2, s4 +; GFX10-NEXT: s_min_u32 s2, s2, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4 -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s2 +; GFX11-NEXT: s_clz_i32_u32 s2, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, 32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -107,7 +107,7 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -127,7 +127,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -164,7 +164,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -190,13 +190,14 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-LABEL: v_ctlz_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -213,7 +214,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -235,7 +236,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -277,7 +278,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -292,7 +293,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -307,9 +308,11 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-LABEL: v_ctlz_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -333,7 +336,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -359,7 +362,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -411,7 +414,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -430,7 +433,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -449,9 +452,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-LABEL: v_ctlz_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -480,7 +485,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -500,7 +505,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -550,7 +555,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -563,7 +568,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -576,7 +581,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX11-LABEL: v_ctlz_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] @@ -598,8 +603,8 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -612,8 +617,8 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -645,11 +650,11 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_ctlz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s0, s[2:3] +; GFX10-NEXT: s_flbit_i32_b64 s0, s[0:1] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -658,12 +663,12 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_ctlz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -673,14 +678,14 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX11-LABEL: s_ctlz_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] +; GFX11-NEXT: s_clz_i32_u64 s0, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_u32 s2, s2, 64 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX11-NEXT: s_min_u32 s0, s0, 64 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: global_store_b64 v1, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -692,7 +697,7 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -706,7 +711,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -737,7 +742,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_flbit_i32_b64 s0, s[6:7] @@ -748,7 +753,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] @@ -759,7 +764,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -778,7 +783,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -799,7 +804,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -847,7 +852,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] @@ -862,7 +867,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] @@ -878,7 +883,9 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-LABEL: v_ctlz_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -905,7 +912,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -926,7 +933,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -974,7 +981,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -989,7 +996,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,18 +1012,20 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX11-LABEL: v_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 -; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp -; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp +; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1033,7 +1042,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1052,7 +1061,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1090,7 +1099,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,7 +1111,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -1117,8 +1126,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1140,7 +1151,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1159,7 +1170,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1197,7 +1208,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1209,7 +1220,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -1224,8 +1235,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1248,7 +1261,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1270,7 +1283,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1313,7 +1326,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1328,7 +1341,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1343,14 +1356,16 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1370,7 +1385,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1392,7 +1407,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1435,7 +1450,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1450,7 +1465,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,14 +1480,16 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1492,7 +1509,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1510,7 +1527,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1552,7 +1569,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] @@ -1563,7 +1580,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 @@ -1583,8 +1600,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1606,7 +1623,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1624,7 +1641,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1674,7 +1691,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] @@ -1689,7 +1706,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] @@ -1705,7 +1722,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1733,7 +1750,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1752,7 +1769,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1795,7 +1812,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] @@ -1807,7 +1824,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 @@ -1829,13 +1846,14 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index a377714ebf737..a55c8cdc9b6e8 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -29,11 +29,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s2 +; SI-NEXT: s_flbit_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -41,10 +41,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_flbit_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -64,13 +64,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, ptr addrspace(1) %out, align 4 @@ -80,7 +80,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -134,7 +134,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -154,7 +154,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -211,7 +211,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -295,7 +295,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -318,11 +318,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s2, 24 +; SI-NEXT: s_lshl_b32 s2, s4, 24 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s2, 24 +; VI-NEXT: s_lshl_b32 s2, s4, 24 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -373,14 +373,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 24 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 24 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i8 %val, 0 @@ -392,11 +392,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_lshl_b32 s2, s4, 16 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -405,12 +405,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s2, s4, 16 ; VI-NEXT: s_flbit_i32_b32 s2, s2 -; VI-NEXT: s_add_i32 s2, s2, -16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -448,14 +447,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 16 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 16 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i16 %val, 0 @@ -467,11 +466,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s2 +; SI-NEXT: s_flbit_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -479,10 +478,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_flbit_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -502,13 +501,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i32 %val, 0 @@ -520,7 +519,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -534,7 +533,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -562,7 +561,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -581,7 +580,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -602,7 +601,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -650,7 +649,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -673,7 +672,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -698,7 +697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -712,8 +711,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -754,7 +753,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -779,7 +778,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -810,7 +809,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -870,7 +869,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -900,7 +899,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -947,7 +946,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1051,7 +1050,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] @@ -1095,7 +1094,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1114,7 +1113,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1159,7 +1158,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 @@ -1184,8 +1183,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1197,14 +1196,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1226,14 +1225,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[0:1] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) @@ -1244,7 +1243,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1257,7 +1256,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1283,7 +1282,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] @@ -1299,7 +1298,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1319,7 +1318,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1365,7 +1364,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] @@ -1389,7 +1388,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -1409,7 +1408,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1455,7 +1454,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1480,7 +1479,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1499,7 +1498,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1535,7 +1534,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -1559,7 +1558,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1578,7 +1577,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1614,7 +1613,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -1638,7 +1637,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1656,7 +1655,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1698,7 +1697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 @@ -1727,7 +1726,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1751,7 +1750,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1800,7 +1799,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1830,7 +1829,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1851,7 +1850,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1889,7 +1888,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -1914,7 +1913,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1935,7 +1934,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1973,7 +1972,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -1998,7 +1997,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2019,7 +2018,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2058,7 +2057,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -2083,7 +2082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2104,7 +2103,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2143,7 +2142,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -2168,18 +2167,15 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { ; SI-LABEL: v_ctlz_zero_undef_i7: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: v_subrev_i32_e32 v0, vcc, 25, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_i7: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: v_add_u16_e32 v0, -9, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 +; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_i7: @@ -2200,13 +2196,12 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0x3ffff -; SI-NEXT: s_flbit_i32_b32 s2, s2 -; SI-NEXT: s_add_i32 s4, s2, -14 +; SI-NEXT: s_lshl_b32 s2, s4, 14 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_bfe_u32 s4, s4, 0x20010 @@ -2218,18 +2213,17 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x3ffff -; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_lshl_b32 s2, s4, 14 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_i32 s2, s2, -14 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_add_u32 s0, s0, 2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_bfe_u32 s2, s2, 0x20010 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2239,20 +2233,18 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; EG-LABEL: s_ctlz_zero_undef_i18: ; EG: ; %bb.0: -; EG-NEXT: ALU 30, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 28, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X ; EG-NEXT: MEM_RAT MSKOR T0.XW, T2.X ; EG-NEXT: CF_END ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, -; EG-NEXT: 262143(3.673406e-40), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00) ; EG-NEXT: FFBH_UINT T0.W, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, -; EG-NEXT: -14(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: LSHL * T1.W, PS, literal.y, ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) ; EG-NEXT: LSHL T1.X, PV.W, PS, ; EG-NEXT: LSHL * T1.W, literal.x, PS, @@ -2278,18 +2270,18 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 14 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 -; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff -; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3] offset:2 +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 14 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 +; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff +; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] offset:2 ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone store i18 %ctlz, ptr addrspace(1) %out, align 4 @@ -2300,17 +2292,15 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) { ; SI-LABEL: v_ctlz_zero_undef_i18: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, -14, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_i18: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -14, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_i18: @@ -2332,23 +2322,19 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { ; SI-LABEL: v_ctlz_zero_undef_v2i18: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0x3ffff, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 14, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, -14, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, -14, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v2i18: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0x3ffff, v1 -; VI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 14, v1 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_ffbh_u32_e32 v1, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, -14, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, -14, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v2i18: @@ -2383,11 +2369,11 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ; VI-LABEL: v_ctlz_zero_undef_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v2i16: @@ -2429,13 +2415,13 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ; VI-LABEL: v_ctlz_zero_undef_v3i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_ffbh_u32_e32 v2, v2 +; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_ffbh_u32_e32 v1, v1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v3i16: @@ -2483,16 +2469,16 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ; VI-LABEL: v_ctlz_zero_undef_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; VI-NEXT: v_ffbh_u32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0xfff00000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_ffbh_u32_e32 v2, v2 +; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_ffbh_u32_e32 v3, v3 +; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v4i16: @@ -2567,28 +2553,19 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) { ; SI-LABEL: v_ctlz_zero_undef_v2i7: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0x7f, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 25, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 -; SI-NEXT: v_subrev_i32_e32 v0, vcc, 25, v0 -; SI-NEXT: v_subrev_i32_e32 v1, vcc, 25, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v2i7: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_and_b32_e32 v2, 0x7f007f, v0 -; VI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 25, v1 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: v_add_u16_e32 v1, -9, v0 -; VI-NEXT: v_and_b32_e32 v0, 0x7f, v2 -; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: v_add_u16_e32 v0, -9, v0 +; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v2i7: diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index b6359f1816979..40929d5883447 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -14,8 +14,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) ; ; VI-LABEL: s_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -142,8 +142,8 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: v_ctpop_add_chain_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -166,8 +166,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctpop_add_chain_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -239,8 +239,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind { ; SI-LABEL: v_ctpop_add_sgpr_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s12, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -259,8 +259,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctpop_add_sgpr_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -344,7 +344,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -430,7 +430,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -562,7 +562,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v16i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_literal: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_movk_i32 s4, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s12, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1254,8 +1254,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt ; ; VI-LABEL: v_ctpop_i16_add_var: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1315,8 +1315,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s12, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_var_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind { ; SI-LABEL: v_ctpop_i16_add_vvar_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v_ctpop_i16_add_vvar_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1487,8 +1487,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) { ; SI-LABEL: ctpop_i16_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: s_cmp_lg_u32 s5, 0 @@ -1517,8 +1517,8 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: s_cmp_lg_u32 s5, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 131ce14a7847c..1c16612bed37f 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -16,8 +16,8 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -92,8 +92,8 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind { ; SI-LABEL: v_ctpop_i64_user: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -115,8 +115,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctpop_i64_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -144,8 +144,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 ; ; VI-LABEL: s_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -178,38 +178,38 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] -; SI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; SI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] -; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; VI-NEXT: s_endpgm %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -220,7 +220,7 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64 define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -334,11 +334,11 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) { ; SI-LABEL: ctpop_i64_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 @@ -363,11 +363,11 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i64_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s8, s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s0, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 @@ -409,8 +409,8 @@ endif: define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind { ; SI-LABEL: s_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val ; ; VI-LABEL: s_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind { ; SI-LABEL: s_ctpop_i65: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,8 +460,8 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) ; ; VI-LABEL: s_ctpop_i65: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -484,7 +484,7 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index ec532c8e4adc3..57fe6cd4e1e45 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -22,11 +22,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s2, s2 +; SI-NEXT: s_ff1_i32_b32 s2, s4 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -61,27 +61,27 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_cttz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s0, s4 -; GFX10-NEXT: s_min_u32 s0, s0, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_ff1_i32_b32 s2, s4 +; GFX10-NEXT: s_min_u32 s2, s2, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -91,7 +91,7 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -148,7 +148,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -204,7 +204,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -310,7 +310,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -362,7 +362,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -381,7 +381,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -475,7 +475,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -487,7 +487,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -505,8 +505,8 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +519,8 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -552,11 +552,11 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_cttz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s0, s[2:3] +; GFX10-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -565,12 +565,12 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_cttz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -584,7 +584,7 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -598,7 +598,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,7 +629,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ff1_i32_b64 s0, s[6:7] @@ -640,7 +640,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] @@ -657,7 +657,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -726,7 +726,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] @@ -741,7 +741,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -787,7 +787,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -835,7 +835,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -850,7 +850,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -876,7 +876,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -895,7 +895,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -933,7 +933,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -945,7 +945,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -970,7 +970,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -989,7 +989,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,7 +1039,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] @@ -1065,7 +1065,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1130,7 +1130,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1145,7 +1145,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1235,7 +1235,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1250,7 +1250,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1275,7 +1275,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 @@ -1375,7 +1375,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1393,7 +1393,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] @@ -1456,7 +1456,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1499,7 +1499,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1542,7 +1542,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] @@ -1554,7 +1554,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 086d99916ba04..81ed823bad204 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -16,11 +16,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s2 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -28,10 +28,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_ff1_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -51,13 +51,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,7 +121,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -141,7 +141,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -219,7 +219,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -305,11 +305,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s2 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -317,10 +317,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_ff1_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -356,13 +356,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i8 %val, 0 @@ -374,11 +374,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s2 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -386,10 +386,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_ff1_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -425,13 +425,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i16 %val, 0 @@ -443,11 +443,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s2 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -455,10 +455,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_ff1_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -478,13 +478,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i32 %val, 0 @@ -496,7 +496,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -538,7 +538,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -577,7 +577,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -622,7 +622,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -668,7 +668,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -721,7 +721,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -745,7 +745,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -776,7 +776,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -866,7 +866,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -913,7 +913,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1091,7 +1091,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1274,7 +1274,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] @@ -1435,7 +1435,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1453,7 +1453,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1498,7 +1498,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1544,7 +1544,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1597,7 +1597,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 4226728dbe118..eb959e30b87f4 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -900,7 +900,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -933,7 +933,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] @@ -955,8 +955,8 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX11-LABEL: load_i8_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -976,7 +976,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -996,7 +996,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,9 +1039,11 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v2i8_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1062,7 +1064,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1084,7 +1086,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1102,7 +1104,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1130,8 +1132,10 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v3i8_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1153,7 +1157,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1175,7 +1179,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1194,7 +1198,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1224,9 +1228,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v4i8_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1253,7 +1259,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1281,7 +1287,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1312,7 +1318,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1355,8 +1361,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -1388,7 +1396,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -1426,7 +1434,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s8, 0x4000405 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,7 +1481,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1521,9 +1529,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 @@ -1563,21 +1573,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 @@ -1586,7 +1596,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 @@ -1599,22 +1609,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1644,11 +1654,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1702,11 +1712,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 @@ -1753,7 +1765,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1792,7 +1804,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1837,7 +1849,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1896,9 +1908,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 @@ -1937,7 +1951,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1964,7 +1978,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1988,7 +2002,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2028,9 +2042,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v8i8_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2059,7 +2075,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2079,7 +2095,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2096,7 +2112,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2122,13 +2138,14 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2147,7 +2164,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2166,7 +2183,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2182,7 +2199,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2206,8 +2223,10 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2231,7 +2250,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -2249,7 +2268,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2264,7 +2283,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] @@ -2286,8 +2305,8 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: i8_zext_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2308,7 +2327,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2336,7 +2355,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2367,7 +2386,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2410,8 +2429,10 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -2442,7 +2463,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2461,7 +2482,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2477,7 +2498,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2501,8 +2522,10 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte0_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2523,7 +2546,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2542,7 +2565,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2558,7 +2581,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2582,8 +2605,10 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2605,7 +2630,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2624,7 +2649,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2640,7 +2665,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2664,8 +2689,10 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte2_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2687,7 +2714,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2706,7 +2733,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2722,7 +2749,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2746,8 +2773,10 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte3_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2769,7 +2798,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -2789,7 +2818,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -2809,7 +2838,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX10-LABEL: cvt_ubyte0_or_multiuse: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2837,15 +2866,17 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 739fff5084135..6db65752db54f 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -7,11 +7,11 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_0_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s2, 16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -19,33 +19,33 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_0_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_lshl_b32 s2, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_0_i16: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_lshl_b32 s0, s4, 16 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[2:3] +; GFX906-NEXT: s_lshl_b32 s2, s4, 16 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_0_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -92,11 +92,11 @@ define i32 @divergent_vec_0_i16(i16 %a) { define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_i16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s2, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -104,33 +104,33 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[2:3] +; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -177,11 +177,11 @@ define i32 @divergent_vec_i16_0(i16 %a) { define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; GCN-LABEL: uniform_vec_f16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s2, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -189,33 +189,33 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; ; GFX9-LABEL: uniform_vec_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_f16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[2:3] +; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -262,7 +262,7 @@ define float @divergent_vec_f16_0(half %a) { define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_i16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -277,7 +277,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_i16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -290,7 +290,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_i16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_i16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -361,7 +361,7 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) { define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_LH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 @@ -376,7 +376,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7 @@ -386,7 +386,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7 @@ -396,7 +396,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX11-LABEL: uniform_vec_i16_LH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -452,7 +452,7 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_HH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -466,7 +466,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7 @@ -476,7 +476,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7 @@ -486,7 +486,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX11-LABEL: uniform_vec_i16_HH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -546,7 +546,7 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_f16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -561,7 +561,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_f16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_f16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_f16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -684,10 +684,10 @@ entry: define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 { ; GCN-LABEL: build_vec_v2i16_undeflo_uniform: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u16 v0, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -698,35 +698,35 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; ; GFX9-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read_u16_d16 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, s4 ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[2:3] +; GFX906-NEXT: global_store_dword v1, v0, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: ds_load_u16_d16 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 777a8f3fef1c1..b72cd7e1d1eca 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -51,7 +51,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -64,7 +64,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -88,7 +88,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:1028 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -102,7 +102,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -126,7 +126,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -142,7 +142,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -184,7 +184,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: s_mov_b32 s2, 0 @@ -202,7 +202,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_barrier ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -245,7 +245,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -301,7 +301,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -319,7 +319,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -352,7 +352,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) % ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -406,7 +406,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -419,7 +419,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -449,7 +449,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -487,7 +487,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +501,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -522,13 +522,11 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[2:3], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:34 ; CI-NEXT: ds_read_u8 v3, v1 offset:32 ; CI-NEXT: ds_read_u8 v4, v1 offset:3 @@ -537,15 +535,13 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: ds_read_u8 v7, v1 ; CI-NEXT: ds_read_u8 v8, v1 offset:33 ; CI-NEXT: ds_read_u8 v1, v1 offset:35 -; CI-NEXT: s_waitcnt lgkmcnt(5) +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -554,6 +550,7 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -561,8 +558,8 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -585,17 +582,17 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -615,13 +612,11 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[2:3], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:11 ; CI-NEXT: ds_read_u8 v3, v1 offset:9 ; CI-NEXT: ds_read_u8 v4, v1 offset:8 @@ -630,15 +625,13 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: ds_read_u8 v7, v1 offset:5 ; CI-NEXT: ds_read_u8 v8, v1 offset:10 ; CI-NEXT: ds_read_u8 v1, v1 offset:12 -; CI-NEXT: s_waitcnt lgkmcnt(5) +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -647,6 +640,7 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -654,8 +648,8 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -678,17 +672,17 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -708,44 +702,41 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_2_simple_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[2:3], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u16 v2, v1 offset:32 ; CI-NEXT: ds_read_u16 v3, v1 offset:2 ; CI-NEXT: ds_read_u16 v4, v1 ; CI-NEXT: ds_read_u16 v1, v1 offset:34 -; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: s_waitcnt lgkmcnt(2) +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_or_b32_e32 v3, v3, v4 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_add_f32_e32 v2, v3, v1 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] @@ -753,12 +744,12 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -781,7 +772,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -794,7 +785,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -817,7 +808,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -830,7 +821,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -854,7 +845,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[1:2], v0 ; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +859,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -889,15 +880,15 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_read2_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[2:3], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v0 ; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 ; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] @@ -907,13 +898,13 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: misaligned_read2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -938,7 +929,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -950,7 +941,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -968,7 +959,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -980,7 +971,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -1000,7 +991,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b128 v[0:3], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1013,7 +1004,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b128 v[0:3], v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1035,7 +1026,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 ; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1049,7 +1040,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1068,12 +1059,11 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 { ; CI-LABEL: sgemm_inner_loop_read2_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; CI-NEXT: s_lshl_b32 s0, s2, 2 -; CI-NEXT: s_add_i32 s1, s0, 0xc20 -; CI-NEXT: s_addk_i32 s0, 0xc60 -; CI-NEXT: v_mov_b32_e32 v0, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_lshl_b32 s4, s6, 2 +; CI-NEXT: s_add_i32 s5, s4, 0xc20 +; CI-NEXT: s_addk_i32 s4, 0xc60 +; CI-NEXT: v_mov_b32_e32 v0, s5 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -1081,24 +1071,29 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(4) ; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v5 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v6 ; CI-NEXT: v_add_f32_e32 v0, v0, v7 ; CI-NEXT: v_add_f32_e32 v0, v0, v8 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_add_f32_e32 v0, v0, v9 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: sgemm_inner_loop_read2_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s2, s2, 2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_lshl_b32 s2, s6, 2 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 ; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1109,16 +1104,12 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 @@ -1172,28 +1163,28 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dword s4, s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %load = load <2 x i32>, ptr addrspace(3) %in, align 4 store <2 x i32> %load, ptr addrspace(1) %out, align 8 @@ -1203,28 +1194,28 @@ define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dword s4, s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %load = load i64, ptr addrspace(3) %in, align 4 store i64 %load, ptr addrspace(1) %out, align 8 @@ -1234,8 +1225,8 @@ define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-LABEL: ds_read_diff_base_interleaving: ; CI: ; %bb.0: ; %bb -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1265,10 +1256,10 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 @@ -1470,7 +1461,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; CI-NEXT: ds_read_u8 v6, v0 offset:66 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 @@ -1497,7 +1488,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 @@ -1514,7 +1505,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 70011e56d016e..f4ec16db55d68 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: extract_vector_elt_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -36,7 +36,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: extract_vector_elt_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -62,8 +62,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s1, s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -77,8 +77,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,8 +95,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 @@ -119,8 +119,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -139,15 +139,15 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dword v2, v[1:2] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -162,12 +162,14 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -193,7 +195,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -208,7 +210,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; VI-LABEL: extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -222,7 +224,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; GFX11-LABEL: extract_vector_elt_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +249,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s4, s4, 4 @@ -262,8 +264,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: dynamic_extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -278,8 +280,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s4, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -300,7 +302,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_extractelement_v4f16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -317,7 +319,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_extractelement_v4f16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -336,7 +338,9 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_extractelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -359,7 +363,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -380,7 +384,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -403,20 +407,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2] -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -434,7 +439,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_01: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -451,7 +456,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_01: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -468,7 +473,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_01: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -495,7 +500,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x1 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -512,7 +517,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -529,7 +534,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_23: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -556,8 +561,8 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 @@ -608,8 +613,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -651,43 +656,46 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[1:4], v1, s[6:7] +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[6:7] ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -704,8 +712,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 @@ -794,8 +802,8 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -869,78 +877,81 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] -; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7] offset:16 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[6:7] offset:16 ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s0, 9 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX11-NEXT: s_cmp_eq_u32 s0, 11 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX11-NEXT: s_cmp_eq_u32 s0, 13 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX11-NEXT: s_cmp_eq_u32 s0, 15 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index f34824cd6cefe..21799ab79b839 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: s_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,8 +23,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: s_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: s_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -66,8 +66,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: s_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: s_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: s_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -102,10 +102,10 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: s_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -120,8 +120,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; VI-LABEL: s_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; GFX9-LABEL: s_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -156,10 +156,10 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GFX11-LABEL: s_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -174,7 +174,7 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -187,7 +187,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -200,7 +200,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX9-LABEL: s_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX11-LABEL: s_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -231,12 +231,12 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) { ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +247,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; VI-LABEL: fabs_fold_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -260,8 +260,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; GFX9-LABEL: fabs_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -273,13 +273,13 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-LABEL: fabs_fold_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3 +; GFX11-NEXT: v_mul_f16_e64 v1, |s4|, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -293,7 +293,7 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -307,7 +307,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -332,7 +332,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -354,8 +356,8 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-LABEL: fabs_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -366,8 +368,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; VI-LABEL: fabs_free_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -378,8 +380,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fabs_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -390,10 +392,10 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fabs_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -411,7 +413,7 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_fold_self_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -437,7 +439,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: v_fabs_fold_self_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -455,7 +457,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_fabs_fold_self_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -468,14 +470,15 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_fabs_fold_self_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -493,8 +496,8 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 { ; CI-LABEL: v_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -521,8 +524,8 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -542,28 +545,30 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s6 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -582,7 +587,7 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -605,7 +610,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; VI-LABEL: v_extract_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -624,7 +629,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -640,7 +645,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -673,7 +680,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_no_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -691,7 +698,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; VI-LABEL: v_extract_fabs_no_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -709,7 +716,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -723,7 +730,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 07581ade57ccd..60e19dcd48f1e 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fabsf_free: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_free: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_bitset0_b32 s0, 31 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %bc= bitcast i32 %in to float @@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_bitset0_b32 s0, 31 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in) @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-LABEL: fabs_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -131,26 +131,26 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s3, 31 -; SI-NEXT: s_bitset0_b32 s2, 31 -; SI-NEXT: s_bitset0_b32 s1, 31 -; SI-NEXT: s_bitset0_b32 s0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_bitset0_b32 s7, 31 +; SI-NEXT: s_bitset0_b32 s6, 31 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_bitset0_b32 s3, 31 @@ -202,7 +202,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) { ; SI-LABEL: fabs_fold: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i ; ; VI-LABEL: fabs_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 @@ -232,23 +232,23 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, |s4|, 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bitpreserve_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_f32_e64 v2, |s2|, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %in.bc = bitcast float %in to i32 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index d53c0411ad88c..f0ce96af90649 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -23,7 +23,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -35,7 +35,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -46,7 +46,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -59,7 +59,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -78,8 +78,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s2, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -89,8 +89,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX8-LABEL: s_test_canonicalize_var_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX9-LABEL: s_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -111,11 +111,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX11-LABEL: s_test_canonicalize_var_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, s2, s2 +; GFX11-NEXT: v_max_f32_e64 v1, s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -123,7 +123,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX12-LABEL: s_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -162,7 +162,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -175,7 +175,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -195,7 +195,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -207,7 +207,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -218,7 +218,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -275,7 +275,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -308,7 +308,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -318,7 +318,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: test_fold_canonicalize_undef_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -326,7 +326,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_fold_canonicalize_undef_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -336,7 +336,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: test_fold_canonicalize_undef_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -351,7 +351,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -361,7 +361,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -379,7 +379,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -413,7 +413,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -424,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -440,7 +440,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +459,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -469,7 +469,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -494,7 +494,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, -1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -503,7 +503,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -513,7 +513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -528,7 +528,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -547,7 +547,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,7 +557,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -572,7 +572,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -582,7 +582,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -590,7 +590,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -615,7 +615,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -636,7 +636,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -647,7 +647,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -663,7 +663,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -674,7 +674,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -695,7 +695,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -711,7 +711,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -722,7 +722,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -732,7 +732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -743,7 +743,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -759,7 +759,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -769,7 +769,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -788,7 +788,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -803,7 +803,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -813,7 +813,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -822,7 +822,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -833,7 +833,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -859,7 +859,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x807fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -878,7 +878,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -893,7 +893,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -903,7 +903,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -912,7 +912,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -922,7 +922,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -937,7 +937,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -947,7 +947,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -966,7 +966,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -981,7 +981,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -991,7 +991,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1010,7 +1010,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1069,7 +1069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1142,7 +1142,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1176,7 +1176,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1186,7 +1186,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1201,7 +1201,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1224,7 +1224,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1237,7 +1237,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1276,7 +1276,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX9-LABEL: s_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX11-LABEL: s_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX12-LABEL: s_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1324,7 +1324,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1368,7 +1368,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1425,7 +1425,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1437,7 +1437,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1448,7 +1448,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,7 +1541,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1550,7 +1550,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1577,7 +1577,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1588,7 +1588,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1597,7 +1597,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1607,7 +1607,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1622,7 +1622,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1652,7 +1652,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1667,7 +1667,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +1678,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1687,7 +1687,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1712,7 +1712,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1761,7 +1761,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,7 +1772,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1793,7 +1793,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1809,7 +1809,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1840,7 +1840,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1856,7 +1856,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1867,7 +1867,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1915,7 +1915,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1949,7 +1949,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1960,7 +1960,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1979,7 +1979,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,7 +2005,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2024,7 +2024,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2039,7 +2039,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2069,7 +2069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2095,7 +2095,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2114,7 +2114,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2129,7 +2129,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,7 +2140,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2159,7 +2159,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2174,7 +2174,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,7 +2185,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2194,7 +2194,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2204,7 +2204,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f64_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f64_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f64_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2264,7 +2264,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f64_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2277,7 +2279,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f64_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2299,7 +2303,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f32_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2316,7 +2320,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f32_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2333,7 +2337,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f32_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2344,7 +2348,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f32_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2357,7 +2363,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f32_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2379,7 +2387,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2397,7 +2405,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2414,7 +2422,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2425,7 +2433,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2438,7 +2448,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2461,7 +2473,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_v2f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2484,7 +2496,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2504,7 +2516,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2515,7 +2527,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2528,7 +2542,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2550,7 +2566,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f64_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2567,7 +2583,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f64_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2584,7 +2600,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f64_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2595,7 +2611,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f64_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2608,7 +2626,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f64_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2630,7 +2650,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f32_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2647,7 +2667,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f32_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2664,7 +2684,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f32_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2675,7 +2695,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f32_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2688,7 +2710,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f32_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2711,7 +2735,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2729,7 +2753,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2746,7 +2770,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2757,7 +2781,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2770,7 +2796,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2794,7 +2822,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2817,7 +2845,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2836,7 +2864,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2847,7 +2875,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2860,7 +2890,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2882,7 +2914,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 { ; GFX6-LABEL: v_test_canonicalize_var_v2f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2899,7 +2931,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_test_canonicalize_var_v2f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2916,7 +2948,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2929,9 +2961,11 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2944,9 +2978,11 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_test_canonicalize_var_v2f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index ae280c5a443e1..fe672f1b3b131 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -15,30 +15,31 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_copysign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_movk_i32 s3, 0x7fff +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_bfi_b32 v2, s3, v0, v1 +; VI-NEXT: s_lshr_b32 s3, s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -46,29 +47,29 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; ; GFX9-LABEL: s_copysign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -81,8 +82,8 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -93,10 +94,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x7fff +; VI-NEXT: s_and_b32 s2, s4, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -105,22 +106,22 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -135,8 +136,8 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -147,10 +148,10 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x7fff +; VI-NEXT: s_and_b32 s2, s4, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -159,22 +160,22 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -189,8 +190,8 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -201,10 +202,10 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x7fff +; VI-NEXT: s_and_b32 s2, s4, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -213,22 +214,22 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_10.0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -243,8 +244,8 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -255,10 +256,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 15 +; VI-NEXT: s_or_b32 s2, s4, 0x8000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -267,22 +268,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -297,8 +298,8 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -309,10 +310,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 15 +; VI-NEXT: s_or_b32 s2, s4, 0x8000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -321,22 +322,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -351,25 +352,26 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s2, v0 +; VI-NEXT: v_and_b32_e32 v2, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -377,23 +379,23 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_0_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -407,25 +409,26 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -434,24 +437,24 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -466,26 +469,27 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0x41200000 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -494,24 +498,24 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; ; GFX9-LABEL: s_test_copysign_f16_10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -526,25 +530,26 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -553,24 +558,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; ; GFX9-LABEL: s_test_copysign_f16_neg1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -585,26 +590,27 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -613,24 +619,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_copysign_f16_neg10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -823,8 +829,8 @@ define half @v_test_copysign_f16_neg10(half %mag) { define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -849,8 +855,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -874,17 +880,17 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] @@ -893,8 +899,10 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -923,8 +931,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -949,8 +957,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -974,15 +982,15 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v1, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 @@ -993,9 +1001,12 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] @@ -1024,8 +1035,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1049,8 +1060,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1074,14 +1085,14 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_brev_b32 s0, -2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v1, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1093,8 +1104,10 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1123,8 +1136,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1150,8 +1163,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1175,14 +1188,14 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_brev_b32 s0, -2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v1, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -1194,10 +1207,12 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[4:5] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -1224,8 +1239,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1251,8 +1266,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1276,14 +1291,14 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[0:1] +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -1295,8 +1310,10 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1325,35 +1342,35 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -1375,15 +1392,15 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1 @@ -1393,10 +1410,12 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1423,8 +1442,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1452,8 +1471,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1477,17 +1496,17 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v1, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_short v2, v0, s[4:5] @@ -1496,8 +1515,10 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1526,8 +1547,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s4, s3, 8 @@ -1590,8 +1611,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s7, 8 ; VI-NEXT: s_and_b32 s1, s7, 0x1ff @@ -1639,10 +1660,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; VI-NEXT: v_or_b32_e32 v2, 0x7c00, v2 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s0, s7, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; VI-NEXT: s_and_b32 s0, s0, 0x8000 -; VI-NEXT: v_or_b32_e32 v2, s0, v2 ; VI-NEXT: s_movk_i32 s0, 0x7fff ; VI-NEXT: v_mov_b32_e32 v3, s8 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 @@ -1651,8 +1669,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 @@ -1673,36 +1691,33 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: s_add_i32 s9, s1, 0xfffffc10 +; GFX9-NEXT: s_add_i32 s7, s1, 0xfffffc10 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_lshl_b32 s1, s9, 12 +; GFX9-NEXT: s_lshl_b32 s1, s7, 12 ; GFX9-NEXT: s_or_b32 s0, s2, s0 ; GFX9-NEXT: s_or_b32 s1, s6, s1 -; GFX9-NEXT: s_cmp_lt_i32 s9, 1 -; GFX9-NEXT: s_cselect_b32 s10, s0, s1 -; GFX9-NEXT: s_and_b32 s2, s10, 7 +; GFX9-NEXT: s_cmp_lt_i32 s7, 1 +; GFX9-NEXT: s_cselect_b32 s9, s0, s1 +; GFX9-NEXT: s_and_b32 s2, s9, 7 ; GFX9-NEXT: s_cmp_gt_i32 s2, 5 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s2, 3 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_lshr_b32 s2, s10, 2 +; GFX9-NEXT: s_lshr_b32 s2, s9, 2 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_addc_u32 s0, s2, 0 -; GFX9-NEXT: s_cmp_lt_i32 s9, 31 +; GFX9-NEXT: s_cmp_lt_i32 s7, 31 ; GFX9-NEXT: s_cselect_b32 s2, s0, 0x7c00 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 9, v1 -; GFX9-NEXT: s_cmpk_eq_i32 s9, 0x40f +; GFX9-NEXT: s_cmpk_eq_i32 s7, 0x40f ; GFX9-NEXT: v_or_b32_e32 v1, 0x7c00, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_lshr_b32 s0, s7, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_and_b32 s0, s0, 0x8000 -; GFX9-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 @@ -1712,8 +1727,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff ; GFX11-NEXT: s_lshr_b32 s2, s7, 8 @@ -1728,13 +1743,13 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: s_addk_i32 s1, 0xfc10 ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 ; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: s_lshl_b32 s8, s1, 12 +; GFX11-NEXT: s_lshl_b32 s7, s1, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s6, v1 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_or_b32 s3, s2, 0x1000 -; GFX11-NEXT: s_or_b32 s8, s2, s8 +; GFX11-NEXT: s_or_b32 s7, s2, s7 ; GFX11-NEXT: s_lshr_b32 s6, s3, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6 @@ -1745,15 +1760,15 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-NEXT: s_or_b32 s3, s6, s3 ; GFX11-NEXT: s_cmp_lt_i32 s1, 1 -; GFX11-NEXT: s_cselect_b32 s3, s3, s8 +; GFX11-NEXT: s_cselect_b32 s3, s3, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s6, s3, 7 ; GFX11-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-NEXT: s_cselect_b32 s7, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: s_lshr_b32 s3, s3, 2 -; GFX11-NEXT: s_or_b32 s6, s6, s8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 @@ -1764,15 +1779,11 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_lshr_b32 s1, s7, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s1, s1, 0x8000 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 ; GFX11-NEXT: global_store_b16 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -1787,7 +1798,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1811,7 +1822,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; VI-LABEL: s_copysign_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s4, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1831,7 +1842,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1850,7 +1861,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX11-LABEL: s_copysign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s3 @@ -1876,8 +1887,8 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s4, 16 @@ -1904,8 +1915,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; VI-LABEL: s_copysign_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -1933,33 +1944,33 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; GFX9-LABEL: s_copysign_v3f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 -; GFX9-NEXT: global_store_short v0, v2, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 +; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s2, s6, 16 @@ -1988,8 +1999,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2023,8 +2034,8 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; VI-LABEL: s_copysign_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -2054,39 +2065,39 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; GFX9-LABEL: s_copysign_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s1, s7, 16 +; GFX9-NEXT: s_lshr_b32 s3, s7, 16 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v3 +; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index f48961c905f58..542d67486e758 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) { ; SI-LABEL: s_test_copysign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; VI-LABEL: s_test_copysign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -34,7 +34,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; GFX11-LABEL: s_test_copysign_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -51,8 +51,8 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -63,10 +63,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -76,10 +76,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -94,8 +94,8 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -106,10 +106,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -119,10 +119,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -137,8 +137,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,10 +149,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -162,10 +162,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -180,8 +180,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -192,10 +192,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -205,10 +205,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -223,8 +223,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,10 +235,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -248,10 +248,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -266,8 +266,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -278,10 +278,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -291,10 +291,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -310,8 +310,8 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -323,10 +323,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -337,10 +337,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -356,8 +356,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -369,10 +369,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; ; VI-LABEL: s_test_copysign_f32_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -383,10 +383,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; GFX11-LABEL: s_test_copysign_f32_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -402,8 +402,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -415,10 +415,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; ; VI-LABEL: s_test_copysign_f32_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -429,10 +429,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; GFX11-LABEL: s_test_copysign_f32_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -448,8 +448,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,10 +461,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -475,10 +475,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f32_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -494,8 +494,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -511,8 +511,8 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -529,8 +529,8 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 @@ -549,40 +549,40 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x float> %mag, <3 x float> %sign) { ; SI-LABEL: s_test_copysign_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s7, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_bfi_b32 v0, s7, v0, v2 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_bfi_b32 v2, s7, v2, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_brev_b32 s7, -2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v2, s7, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_bfi_b32 v1, s2, v3, v0 +; VI-NEXT: v_bfi_b32 v1, s7, v3, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_bfi_b32 v0, s2, v0, v3 +; VI-NEXT: v_bfi_b32 v0, s7, v0, v3 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] @@ -591,8 +591,8 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo ; GFX11-LABEL: s_test_copysign_v3f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9 @@ -614,45 +614,45 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) { ; SI-LABEL: s_test_copysign_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_brev_b32 s12, -2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_bfi_b32 v2, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v2, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_bfi_b32 v0, s12, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_brev_b32 s12, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_bfi_b32 v2, s2, v2, v0 +; VI-NEXT: v_bfi_b32 v2, s12, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v1, s12, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_bfi_b32 v0, s2, v0, v4 +; VI-NEXT: v_bfi_b32 v0, s12, v0, v4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -661,8 +661,8 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo ; GFX11-LABEL: s_test_copysign_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10 @@ -906,46 +906,46 @@ define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) { define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) { ; SI-LABEL: s_test_copysign_f32_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_brev_b32 s0, -2 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_bfi_b32 v2, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -958,7 +958,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -972,7 +972,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s3, 0x80000000 @@ -984,7 +984,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1003,7 +1003,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, float %mag, half %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,7 +1018,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1050,23 +1050,24 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f32_1_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: v_or_b32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_1_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_lshl_b32 s2, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1078,10 +1079,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out ; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 @@ -1100,7 +1101,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, float %mag, bfloat %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1116,7 +1117,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1129,7 +1130,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 6c5b2917855fc..93105e57a5918 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -68,7 +68,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ninf: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -118,7 +118,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX11-LABEL: s_fdiv_f32_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -181,7 +181,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -229,7 +229,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ieee: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -253,7 +253,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ieee: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 @@ -294,7 +294,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX11-LABEL: s_fdiv_f32_ieee: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 @@ -334,7 +334,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_25ulp_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX8-LABEL: s_fdiv_25ulp_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7| @@ -384,7 +384,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX11-LABEL: s_fdiv_25ulp_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -420,7 +420,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -446,7 +446,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -465,7 +465,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s3 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 @@ -482,7 +482,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s7 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 @@ -498,7 +498,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s3 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -535,7 +535,7 @@ entry: define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX67-LABEL: s_fdiv_fast_ieee_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -548,7 +548,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -559,7 +559,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 @@ -569,7 +569,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_fast_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -599,7 +599,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -612,7 +612,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -623,7 +623,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 @@ -633,7 +633,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -663,7 +663,7 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -676,7 +676,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -687,7 +687,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 @@ -697,7 +697,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -727,7 +727,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -753,7 +753,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -779,7 +779,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX7-LABEL: s_fdiv_f32_arcp_daz: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -805,7 +805,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_daz: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -829,7 +829,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 @@ -850,7 +850,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -892,7 +892,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_arcp_ninf: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -905,7 +905,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -916,7 +916,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 @@ -926,7 +926,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -956,8 +956,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -996,11 +996,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s4 @@ -1013,10 +1012,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 @@ -1031,13 +1031,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v4 +; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,11 +1077,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 +; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 @@ -1093,10 +1093,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 ; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1108,6 +1109,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4 @@ -1116,11 +1118,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s5 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1130,8 +1131,9 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4 +; GFX10-NEXT: v_div_scale_f32 v2, s0, s6, s6, s4 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5 ; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 @@ -1153,8 +1155,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 @@ -1212,8 +1214,8 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1226,8 +1228,8 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s6 ; GFX8-NEXT: v_rcp_f32_e32 v1, s7 @@ -1241,22 +1243,22 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, s7 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s6 ; GFX11-NEXT: v_rcp_f32_e32 v1, s7 @@ -1290,8 +1292,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1304,8 +1306,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; ; GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1319,22 +1321,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1368,8 +1370,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1382,8 +1384,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1397,22 +1399,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1446,7 +1448,7 @@ entry: define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v4f32: ; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 @@ -1517,7 +1519,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v4f32: ; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -1588,7 +1590,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX7-LABEL: s_fdiv_v4f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 @@ -1659,7 +1661,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: s_fdiv_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1730,7 +1732,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: s_fdiv_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1792,7 +1794,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: s_fdiv_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1893,7 +1895,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_fast_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -1912,7 +1914,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_fast_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -1931,7 +1933,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_fast_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1949,7 +1951,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_fast_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2001,7 +2003,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_arcp_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -2020,7 +2022,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_arcp_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -2039,7 +2041,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_arcp_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -2057,7 +2059,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_arcp_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2109,8 +2111,8 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2132,11 +2134,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2150,13 +2152,14 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,11 +2181,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -2194,6 +2197,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2201,11 +2205,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -2217,7 +2221,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2225,11 +2229,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -2242,7 +2246,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2267,8 +2271,8 @@ entry: define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2288,11 +2292,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2304,13 +2308,14 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2330,11 +2335,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 ; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2 @@ -2344,6 +2349,7 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2351,21 +2357,21 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2373,22 +2379,22 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index 26714dcc6dfac..8510e26a3eafb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -21,14 +21,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -36,11 +36,11 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -55,14 +55,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -70,14 +70,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -85,11 +85,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -104,14 +104,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -119,14 +119,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -134,11 +134,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -155,8 +155,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -173,8 +173,8 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -191,12 +191,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_add_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -214,18 +214,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -233,18 +233,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_add_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -252,11 +252,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_add_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -276,18 +276,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -298,18 +298,18 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -320,11 +320,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -348,12 +348,12 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -361,12 +361,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -374,11 +374,11 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -392,8 +392,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -408,8 +408,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -424,12 +424,12 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -446,16 +446,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -463,16 +463,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -480,11 +480,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_add_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -503,16 +503,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -523,16 +523,16 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -543,11 +543,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_add_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -570,14 +570,14 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -585,14 +585,14 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -600,11 +600,11 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -619,8 +619,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -637,8 +637,8 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -655,12 +655,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -678,18 +678,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -697,18 +697,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_and_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -716,11 +716,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_and_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -740,18 +740,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -762,18 +762,18 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -784,11 +784,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -812,12 +812,12 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -825,12 +825,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -838,11 +838,11 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -856,8 +856,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -872,8 +872,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -888,12 +888,12 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -910,16 +910,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -927,16 +927,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -944,11 +944,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_and_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -967,16 +967,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -987,16 +987,16 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1007,11 +1007,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1034,14 +1034,14 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1049,14 +1049,14 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1064,11 +1064,11 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1083,8 +1083,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1101,8 +1101,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1119,12 +1119,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1142,18 +1142,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1161,18 +1161,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1180,11 +1180,11 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_sub_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1204,18 +1204,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1226,18 +1226,18 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1248,11 +1248,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1276,12 +1276,12 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1289,12 +1289,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1302,11 +1302,11 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1320,8 +1320,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1336,8 +1336,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1352,12 +1352,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1374,16 +1374,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1391,16 +1391,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1408,11 +1408,11 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_sub_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1431,16 +1431,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1451,16 +1451,16 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1471,11 +1471,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1498,39 +1498,39 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1562,8 +1562,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1580,12 +1580,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1603,47 +1603,47 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1662,18 +1662,18 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1684,18 +1684,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -1706,11 +1706,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1734,35 +1734,35 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1775,8 +1775,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1791,8 +1791,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_max_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1807,12 +1807,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_max_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1829,43 +1829,43 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1883,16 +1883,16 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1903,16 +1903,16 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -1923,11 +1923,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1950,39 +1950,39 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1996,8 +1996,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2014,8 +2014,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2032,12 +2032,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2055,47 +2055,47 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2114,18 +2114,18 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2136,18 +2136,18 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2158,11 +2158,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2186,35 +2186,35 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2227,8 +2227,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2243,8 +2243,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umax_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2259,12 +2259,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umax_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2281,43 +2281,43 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2335,16 +2335,16 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2355,16 +2355,16 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2375,11 +2375,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2402,39 +2402,39 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2448,8 +2448,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2466,8 +2466,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2484,12 +2484,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2507,47 +2507,47 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2566,18 +2566,18 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2588,18 +2588,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2610,11 +2610,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2638,35 +2638,35 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2679,8 +2679,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2695,8 +2695,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2711,12 +2711,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2733,43 +2733,43 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2787,16 +2787,16 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2807,16 +2807,16 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2827,11 +2827,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2854,39 +2854,39 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,8 +2900,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2918,8 +2918,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2936,12 +2936,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2959,47 +2959,47 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3018,18 +3018,18 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3040,18 +3040,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3062,11 +3062,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3090,35 +3090,35 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -3131,8 +3131,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3147,8 +3147,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umin_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3163,12 +3163,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umin_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3185,43 +3185,43 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3239,16 +3239,16 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3259,16 +3259,16 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3279,11 +3279,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3306,14 +3306,14 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3321,14 +3321,14 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3336,11 +3336,11 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3355,8 +3355,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3373,8 +3373,8 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3391,12 +3391,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3414,18 +3414,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3433,18 +3433,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3452,11 +3452,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN3-LABEL: atomic_or_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3476,18 +3476,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3498,18 +3498,18 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3520,11 +3520,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3548,12 +3548,12 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3561,12 +3561,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3574,11 +3574,11 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3592,8 +3592,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3608,8 +3608,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3624,12 +3624,12 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3646,16 +3646,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3663,16 +3663,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3680,11 +3680,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN3-LABEL: atomic_or_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3703,16 +3703,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3723,16 +3723,16 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3743,11 +3743,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3770,14 +3770,14 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3785,14 +3785,14 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3800,11 +3800,11 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3819,14 +3819,14 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; GCN1-LABEL: atomic_xchg_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3834,14 +3834,14 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN2-LABEL: atomic_xchg_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3849,11 +3849,11 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN3-LABEL: atomic_xchg_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3868,8 +3868,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3886,8 +3886,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3904,12 +3904,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3927,18 +3927,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3946,18 +3946,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3965,11 +3965,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN3-LABEL: atomic_xchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3989,18 +3989,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4011,18 +4011,18 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4033,11 +4033,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4061,12 +4061,12 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4074,12 +4074,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4087,11 +4087,11 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4105,8 +4105,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4121,8 +4121,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4137,12 +4137,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4159,16 +4159,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4176,16 +4176,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4193,11 +4193,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4216,16 +4216,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4236,16 +4236,16 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4256,11 +4256,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4285,7 +4285,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4300,7 +4300,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN2-LABEL: atomic_cmpxchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4315,7 +4315,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -4334,8 +4334,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 16 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -4353,8 +4353,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 16 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -4372,13 +4372,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4397,19 +4397,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s2 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4418,19 +4418,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s2 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4439,12 +4439,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4465,19 +4465,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v1, s2 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4489,19 +4489,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v1, s2 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4513,12 +4513,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4544,7 +4544,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 @@ -4557,7 +4557,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 @@ -4570,7 +4570,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -4588,8 +4588,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 @@ -4605,8 +4605,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 @@ -4622,13 +4622,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4646,17 +4646,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s2 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4665,17 +4665,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s2 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4684,12 +4684,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4709,17 +4709,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v1, s2 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4731,17 +4731,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v1, s2 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4753,12 +4753,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4783,14 +4783,14 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4798,14 +4798,14 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4813,11 +4813,11 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4832,8 +4832,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -4850,8 +4850,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -4868,12 +4868,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4891,18 +4891,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4910,18 +4910,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4929,11 +4929,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_xor_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4953,18 +4953,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4975,18 +4975,18 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4997,11 +4997,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5025,12 +5025,12 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5038,12 +5038,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5051,11 +5051,11 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5069,8 +5069,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -5085,8 +5085,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -5101,12 +5101,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5123,16 +5123,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5140,16 +5140,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xor_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5157,11 +5157,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xor_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5180,16 +5180,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5200,16 +5200,16 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5220,11 +5220,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5247,7 +5247,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5279,7 +5279,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5300,7 +5300,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5314,7 +5314,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5328,7 +5328,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5348,8 +5348,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5368,8 +5368,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5388,10 +5388,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5414,8 +5414,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5432,8 +5432,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5450,10 +5450,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5475,37 +5475,37 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5518,33 +5518,33 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5556,8 +5556,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5572,8 +5572,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5588,15 +5588,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -5609,8 +5609,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5623,8 +5623,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN2-LABEL: atomic_store_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5637,15 +5637,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN3-LABEL: atomic_store_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5657,7 +5657,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5673,7 +5673,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5689,7 +5689,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5710,7 +5710,7 @@ entry: define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5724,7 +5724,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5738,7 +5738,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5758,8 +5758,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5778,8 +5778,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5798,10 +5798,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5824,8 +5824,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5842,8 +5842,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5860,10 +5860,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5885,37 +5885,37 @@ entry: define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5928,33 +5928,33 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5966,8 +5966,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5982,8 +5982,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN2-LABEL: atomic_store_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5998,15 +5998,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN3-LABEL: atomic_store_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6019,8 +6019,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6033,8 +6033,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN2-LABEL: atomic_store_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6047,15 +6047,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN3-LABEL: atomic_store_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6067,7 +6067,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6083,7 +6083,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6099,7 +6099,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6120,7 +6120,7 @@ entry: define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6134,7 +6134,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6148,7 +6148,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6168,8 +6168,8 @@ entry: define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_load_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 @@ -6206,11 +6206,11 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_load_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s0, s4, s2 -; GCN3-NEXT: s_addc_u32 s1, s5, s3 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc @@ -6231,37 +6231,37 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6274,33 +6274,33 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6312,8 +6312,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s6 ; GCN1-NEXT: s_addc_u32 s1, s5, s7 @@ -6327,8 +6327,8 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_store_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s6 ; GCN2-NEXT: s_addc_u32 s1, s5, s7 @@ -6342,14 +6342,14 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_store_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s0, s4, s6 ; GCN3-NEXT: s_addc_u32 s1, s5, s7 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6362,7 +6362,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6378,7 +6378,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6394,7 +6394,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6415,7 +6415,7 @@ entry: define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6429,7 +6429,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6443,7 +6443,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6463,8 +6463,8 @@ entry: define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -6483,8 +6483,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -6503,10 +6503,10 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -6529,37 +6529,37 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6572,33 +6572,33 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6610,8 +6610,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6626,8 +6626,8 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6642,15 +6642,15 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6663,37 +6663,37 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6706,33 +6706,33 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6744,33 +6744,33 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6782,33 +6782,33 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6819,14 +6819,14 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6834,14 +6834,14 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6849,11 +6849,11 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6868,14 +6868,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6883,14 +6883,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6898,11 +6898,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6917,14 +6917,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6932,14 +6932,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6947,11 +6947,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -6968,8 +6968,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -6986,8 +6986,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7004,12 +7004,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7027,18 +7027,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7046,18 +7046,18 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i32_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7065,11 +7065,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_inc_i32_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7089,18 +7089,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7111,18 +7111,18 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7133,11 +7133,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7161,12 +7161,12 @@ entry: define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7174,12 +7174,12 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7187,11 +7187,11 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7205,8 +7205,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7221,8 +7221,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7237,12 +7237,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7259,16 +7259,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7276,16 +7276,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_inc_i32_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7293,11 +7293,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_inc_i32_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7316,16 +7316,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7336,16 +7336,16 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7356,11 +7356,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7383,14 +7383,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7398,14 +7398,14 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7413,11 +7413,11 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7432,14 +7432,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7447,14 +7447,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7462,11 +7462,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7481,14 +7481,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7496,14 +7496,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7511,11 +7511,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -7532,8 +7532,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -7550,8 +7550,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7568,12 +7568,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7591,18 +7591,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7610,18 +7610,18 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i32_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7629,11 +7629,11 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_dec_i32_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7653,18 +7653,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7675,18 +7675,18 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7697,11 +7697,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7725,12 +7725,12 @@ entry: define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7738,12 +7738,12 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7751,11 +7751,11 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7769,8 +7769,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7785,8 +7785,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7801,12 +7801,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7823,16 +7823,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7840,16 +7840,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i32_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7857,11 +7857,11 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_dec_i32_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7880,16 +7880,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7900,16 +7900,16 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7920,11 +7920,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7947,7 +7947,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7963,7 +7963,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7979,7 +7979,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -7999,7 +7999,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8013,7 +8013,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8027,7 +8027,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8046,7 +8046,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -8062,7 +8062,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -8078,7 +8078,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8098,7 +8098,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8112,7 +8112,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8126,7 +8126,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 66aacd7062a6d..9c2faf622623d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3823,7 +3823,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -3883,7 +3883,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s1, s7, 31 ; GCN3-NEXT: s_mov_b32 s0, s7 @@ -3918,8 +3918,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -3953,8 +3953,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -3988,32 +3988,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4029,7 +4029,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4057,7 +4057,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -4085,7 +4085,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s1, s7, 31 ; GCN3-NEXT: s_mov_b32 s0, s7 @@ -4119,8 +4119,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -4152,8 +4152,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -4185,32 +4185,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4966,7 +4966,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4996,7 +4996,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -5026,7 +5026,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s1, s7, 31 ; GCN3-NEXT: s_mov_b32 s0, s7 @@ -5061,8 +5061,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5096,8 +5096,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5131,32 +5131,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5172,8 +5172,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5205,8 +5205,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5238,32 +5238,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6760,7 +6760,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -6790,7 +6790,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -6820,7 +6820,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s1, s7, 31 ; GCN3-NEXT: s_mov_b32 s0, s7 @@ -6855,8 +6855,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -6890,8 +6890,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -6925,32 +6925,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6966,8 +6966,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 @@ -6990,8 +6990,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 @@ -7014,17 +7014,17 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s4, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7043,8 +7043,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -7076,8 +7076,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -7109,32 +7109,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index bac2d8b8b40c2..4846e21fe836e 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -21,7 +21,7 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -41,7 +41,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -75,7 +77,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -101,7 +103,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -146,7 +150,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -166,7 +170,9 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -200,7 +206,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -220,7 +226,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -254,7 +262,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -280,7 +288,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -325,7 +335,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -345,7 +355,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -379,7 +391,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -405,7 +417,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -450,7 +464,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -470,7 +484,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -506,7 +522,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -532,7 +548,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -579,7 +597,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -605,7 +623,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -652,7 +672,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +698,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -703,7 +723,9 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -727,7 +749,9 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -774,7 +798,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -800,7 +824,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -825,7 +849,9 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -849,7 +875,9 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -899,56 +927,56 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -966,8 +994,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -992,56 +1020,56 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1059,8 +1087,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1085,55 +1113,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1152,8 +1180,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1178,55 +1206,55 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1245,8 +1273,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1271,55 +1299,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1338,8 +1366,8 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1364,55 +1392,55 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1431,8 +1459,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1457,55 +1485,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1524,8 +1552,8 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1550,55 +1578,55 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1617,8 +1645,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1643,55 +1671,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1710,8 +1738,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1736,55 +1764,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1803,8 +1831,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1829,55 +1857,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1896,8 +1924,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1922,55 +1950,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1989,8 +2017,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -2019,7 +2047,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_interp: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s10, -1 ; SI-NOFMA-NEXT: s_mov_b32 s14, s10 @@ -2051,7 +2079,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; SI-FMA-LABEL: test_f32_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2081,7 +2109,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f32_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2102,7 +2130,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f32_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2135,7 +2163,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; SI-FMA-LABEL: test_f64_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2165,7 +2193,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f64_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2186,7 +2214,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f64_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2220,7 +2248,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_neg_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2236,7 +2264,9 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2266,7 +2296,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2282,7 +2312,9 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: fma_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2312,7 +2344,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 { ; SI-LABEL: fma_neg_b_c_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0 @@ -2333,7 +2365,9 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: fma_neg_b_c_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 7830c91851bfa..c60b9858abd83 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -15,7 +15,7 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_fadd_use_test_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_fadd_use_test_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -79,20 +79,20 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, float %x, [8 x i32], float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmac_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-NEXT: s_load_dword s3, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-NEXT: s_load_dword s3, s[6:7], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s0, 4 -; VI-NEXT: v_add_f32_e64 v2, s6, s6 +; VI-NEXT: v_add_f32_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mac_f32_e64 v3, s6, 2.0 +; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -101,9 +101,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX10-LABEL: multiple_use_fadd_fmac_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, s2, s2 @@ -117,13 +117,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX11-LABEL: multiple_use_fadd_fmac_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, s2, s2 -; GFX11-NEXT: v_fma_f32 v2, s2, 2.0, s3 +; GFX11-NEXT: v_add_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_fma_f32 v2, s4, 2.0, s5 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc @@ -142,7 +142,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -161,7 +161,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_use_fadd_fmad_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -174,7 +174,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_use_fadd_fmad_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -198,21 +198,21 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_use_fadd_multi_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s6, 4 +; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0 ; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_addc_u32 s5, s7, 0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -220,23 +220,23 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; GFX10-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_fma_f32 v1, |s0|, 2.0, s1 ; GFX10-NEXT: v_fma_f32 v2, |s0|, 2.0, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[6:7] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[6:7] offset:4 +; GFX10-NEXT: global_store_dword v0, v2, s[4:5] offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v1, |s4|, 2.0, s5 @@ -261,8 +261,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -275,8 +275,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn2_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, s2, -4.0 @@ -288,12 +288,12 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn2_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v0, s4, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -310,8 +310,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn3_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 @@ -325,8 +325,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn3_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 @@ -338,12 +338,12 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn3_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 +; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -360,8 +360,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_fadd_use_test_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -378,8 +378,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_fadd_use_test_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -396,13 +396,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -414,12 +414,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -433,14 +433,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s3, -1.0 +; GFX11-DENORM-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -448,6 +447,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0 ; GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -455,13 +455,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s3, -1.0 +; GFX11-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -472,6 +471,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -496,14 +496,14 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, s6, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, s6, s6 +; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -517,12 +517,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, s6, s6 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3 @@ -530,7 +530,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s6, 2.0 +; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -539,8 +539,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -555,8 +555,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 @@ -571,13 +571,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s2, s2 -; GFX11-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3 +; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s4, s4 +; GFX11-DENORM-NEXT: v_fma_f16 v2, s4, 2.0, s2 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -589,12 +589,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s4, s4 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -617,14 +617,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, |s6|, |s6| +; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -638,14 +638,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 -; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 +; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s6|, |s6| +; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 @@ -660,8 +660,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -676,8 +676,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| @@ -692,13 +692,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2| -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3 +; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s4|, |s4| +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s2 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -710,12 +710,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -739,9 +739,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[6:7], 0x8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -762,9 +762,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; ; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[6:7], 0x8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -786,14 +786,14 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, s1 -; GFX10-DENORM-NEXT: v_fma_f16 v1, |s6|, 2.0, s0 +; GFX10-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 +; GFX10-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3] offset:2 @@ -803,12 +803,12 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s6|, |s6| +; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| ; GFX10-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 @@ -821,17 +821,17 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x2 -; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s3 -; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s2 -; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-DENORM-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 +; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 +; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3] offset:2 dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -840,19 +840,19 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x2 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 -; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s3, v0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] dlc +; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 +; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 +; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[2:3] dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc +; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[2:3] offset:2 dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -873,8 +873,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -887,8 +887,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, s2, -4.0 @@ -900,13 +900,13 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, s2, -4.0 +; GFX11-NEXT: v_mul_f16_e64 v0, s4, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -925,8 +925,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn3_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 @@ -940,8 +940,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn3_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, 0xc600, s2 @@ -953,13 +953,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn3_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s2 +; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 9943976dd86da..70cdfeb6d4954 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -22,7 +22,7 @@ declare half @llvm.fabs.f16(half) #1 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -62,7 +62,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -78,7 +78,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -92,7 +92,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -111,7 +111,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -136,7 +136,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -156,7 +156,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -176,7 +176,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-FLUSH-LABEL: fmul_fadd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -192,7 +192,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: s_clause 0x2 @@ -208,7 +208,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -222,7 +222,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-FLUSH-LABEL: fmul_fadd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: s_clause 0x2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -286,7 +286,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_contract_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -306,7 +306,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; VI-DENORM-LABEL: fmul_fadd_contract_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -326,7 +326,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -342,7 +342,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -356,7 +356,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -401,7 +401,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -437,7 +437,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -451,7 +451,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -464,7 +464,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -481,7 +483,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -509,7 +513,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -527,7 +531,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -545,7 +549,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -559,7 +563,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -572,7 +576,9 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -589,7 +595,9 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -617,7 +625,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_a_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -635,7 +643,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -653,7 +661,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -667,7 +675,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -681,7 +689,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -694,7 +702,9 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -711,7 +721,9 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -728,7 +740,9 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -759,7 +773,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_b_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -777,7 +791,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -795,7 +809,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -809,7 +823,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -823,7 +837,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -836,7 +850,9 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -853,7 +869,9 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -870,7 +888,9 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -901,7 +921,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -919,7 +939,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -937,7 +957,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -951,7 +971,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -964,7 +984,9 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -981,7 +1003,9 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1009,7 +1033,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1027,7 +1051,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1045,7 +1069,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1059,7 +1083,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1072,7 +1096,9 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1089,7 +1115,9 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1119,7 +1147,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1137,7 +1165,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1155,7 +1183,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1169,7 +1197,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1182,7 +1210,9 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1199,7 +1229,9 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1229,7 +1261,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1247,7 +1279,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1265,7 +1297,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1279,7 +1311,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1292,7 +1324,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1309,7 +1343,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1339,7 +1375,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1364,7 +1400,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1389,7 +1425,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1405,7 +1441,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1421,7 +1457,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1436,7 +1472,9 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-FLUSH-LABEL: mad_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1455,7 +1493,9 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1474,7 +1514,9 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1508,7 +1550,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1533,7 +1575,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1558,7 +1600,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1574,7 +1616,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1590,7 +1632,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1605,7 +1647,9 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1624,7 +1668,9 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1643,7 +1689,9 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1677,7 +1725,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1702,7 +1750,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1727,7 +1775,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1743,7 +1791,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1759,7 +1807,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1774,7 +1822,9 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1793,7 +1843,9 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1812,7 +1864,9 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1847,7 +1901,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1872,7 +1926,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1897,7 +1951,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1913,7 +1967,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1929,7 +1983,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -1944,7 +1998,9 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1963,7 +2019,9 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1982,7 +2040,9 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2017,7 +2077,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: neg_neg_mad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2042,7 +2102,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2067,7 +2127,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -2083,7 +2143,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -2099,7 +2159,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -2114,7 +2174,9 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2133,7 +2195,9 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2152,7 +2216,9 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2188,7 +2254,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_fabs_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2213,7 +2279,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2238,7 +2304,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -2254,7 +2320,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -2270,7 +2336,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc @@ -2285,7 +2351,9 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2304,7 +2372,9 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2323,7 +2393,9 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2358,7 +2430,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2376,7 +2448,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2394,7 +2466,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2408,7 +2480,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2422,7 +2494,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2435,7 +2507,9 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2452,7 +2526,9 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2469,7 +2545,9 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2499,7 +2577,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2517,7 +2595,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2535,7 +2613,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2549,7 +2627,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2563,7 +2641,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2576,7 +2654,9 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2593,7 +2673,9 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2610,7 +2692,9 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index f411a76e75ab6..ba8b6fb80518f 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -15,8 +15,8 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; SI-LABEL: fnearbyint_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,23 +28,24 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; ; CI-LABEL: fnearbyint_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_load_dword s0, s[2:3], 0xb ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_rndne_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fnearbyint_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f16_e32 v2, s2 +; VI-NEXT: v_rndne_f16_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -53,11 +54,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; GFX11-LABEL: fnearbyint_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f16_e32 v1, s2 +; GFX11-NEXT: v_rndne_f16_e32 v1, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -70,8 +71,8 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; SICI-LABEL: fnearbyint_f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dword s4, s[0:1], 0xb -; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SICI-NEXT: s_load_dword s4, s[2:3], 0xb +; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -81,10 +82,10 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; ; VI-LABEL: fnearbyint_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f32_e32 v2, s2 +; VI-NEXT: v_rndne_f32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -93,11 +94,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; GFX11-LABEL: fnearbyint_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s2 +; GFX11-NEXT: v_rndne_f32_e32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -111,7 +112,7 @@ entry: define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v2f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SICI-NEXT: s_mov_b32 s7, 0xf000 ; SICI-NEXT: s_mov_b32 s6, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +125,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fnearbyint_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_rndne_f32_e32 v1, s3 @@ -135,7 +136,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; GFX11-LABEL: fnearbyint_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v1, s3 @@ -153,8 +154,8 @@ entry: define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v4f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -167,8 +168,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fnearbyint_v4f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -182,8 +183,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; GFX11-LABEL: fnearbyint_v4f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v3, s7 @@ -203,7 +204,7 @@ entry: define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-LABEL: nearbyint_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s8, -2 @@ -227,7 +228,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; CI-LABEL: nearbyint_f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -237,7 +238,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: nearbyint_f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -247,7 +248,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; GFX11-LABEL: nearbyint_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] @@ -263,41 +264,41 @@ entry: define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: nearbyint_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 ; SI-NEXT: s_mov_b32 s9, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, s3 +; SI-NEXT: v_mov_b32_e32 v7, s7 ; SI-NEXT: v_bfi_b32 v1, s10, v6, v7 -; SI-NEXT: v_mov_b32_e32 v8, s2 -; SI-NEXT: v_mov_b32_e32 v9, s1 -; SI-NEXT: v_mov_b32_e32 v10, s0 -; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1] +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1] +; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: nearbyint_v2f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -308,8 +309,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; VI-LABEL: nearbyint_v2f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] @@ -321,8 +322,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; GFX11-LABEL: nearbyint_v2f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] @@ -340,8 +341,8 @@ entry: define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: nearbyint_v4f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x11 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_brev_b32 s14, -2 @@ -390,8 +391,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; CI-LABEL: nearbyint_v4f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,8 +406,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; VI-LABEL: nearbyint_v4f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] ; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] @@ -425,8 +426,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; GFX11-LABEL: nearbyint_v4f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index b5440b9c38c9f..74e2b9ea71425 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2799,7 +2799,7 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -2813,7 +2813,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3016,41 +3016,41 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s4, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v3, s3 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_select_infloop_regression_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s4, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; VI-NEXT: s_cselect_b32 s2, 0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_cselect_b32 s0, 0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %i = select i1 %arg1, double 0.0, double %arg @@ -3080,11 +3080,11 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_bitcmp1_b32 s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_bitcmp1_b32 s4, 16 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] @@ -3096,11 +3096,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; ; VI-LABEL: s_fneg_select_infloop_regression_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s2, 16 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitcmp1_b32 s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 @@ -3146,7 +3146,7 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s1, 1, s1 ; SI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3161,7 +3161,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, 1, s1 ; VI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3216,8 +3216,8 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3235,8 +3235,8 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3279,7 +3279,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fabs_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3293,7 +3293,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fabs_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3329,7 +3329,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_fabs_select_infloop_regression: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 4364b32e62f8c..8267bb9f5450f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -7,12 +7,12 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -23,8 +23,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -36,8 +36,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fadd_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -49,13 +49,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2| +; GFX11-NEXT: v_sub_f16_e64 v1, s2, |s4| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -70,13 +70,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1| -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -87,8 +87,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fmul_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -100,8 +100,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fmul_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -113,13 +113,13 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fmul_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2| +; GFX11-NEXT: v_mul_f16_e64 v1, s2, -|s4| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -137,8 +137,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: fneg_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -149,8 +149,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: fneg_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -161,8 +161,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: fneg_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -173,10 +173,10 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: fneg_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -193,8 +193,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: fneg_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -205,8 +205,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fneg_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -217,8 +217,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -229,10 +229,10 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fneg_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -248,7 +248,7 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -262,7 +262,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -273,7 +273,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -293,12 +293,12 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; CI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -314,8 +314,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -331,8 +331,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -344,11 +344,11 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -367,8 +367,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -379,8 +379,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_fneg_fabs_v2f16_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -391,8 +391,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -403,10 +403,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_or_b32 s2, s4, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -422,7 +422,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -435,7 +435,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX9-LABEL: fneg_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -447,7 +447,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX11-LABEL: fneg_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -467,12 +467,12 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -487,8 +487,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fold_user_fneg_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -503,8 +503,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: fold_user_fneg_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -515,11 +515,11 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: fold_user_fneg_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -536,8 +536,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -553,8 +553,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; VI-LABEL: s_fneg_multi_use_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -570,11 +570,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -585,8 +585,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -609,8 +609,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -633,8 +633,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_pk_mul_f16 v1, s4, -4.0 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] @@ -668,8 +668,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 3c000d4fa63a3..6446145bbfe2a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fadd_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fadd_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_sub_f32_e64 v2, s3, |v0| @@ -36,7 +36,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fmul_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fmul_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0| @@ -67,11 +67,11 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_fabsf_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_b32 s4, s2, 0x80000000 +; SI-NEXT: s_bitset1_b32 s4, 31 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -79,10 +79,10 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_fabsf_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -129,11 +129,11 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_b32 s4, s2, 0x80000000 +; SI-NEXT: s_bitset1_b32 s4, 31 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -141,10 +141,10 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -159,7 +159,7 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fneg_fabsf_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -213,7 +213,7 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fneg_fabsf_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s2, 31 @@ -232,8 +232,8 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fneg_fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s7, 31 @@ -250,8 +250,8 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fneg_fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s7, 0x80000000 ; VI-NEXT: s_or_b32 s3, s6, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index d78bdfe08772a..e447429539e6f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,10 +19,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: s_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,10 +32,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: s_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -50,7 +50,7 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) { ; SI-LABEL: s_fneg_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -65,7 +65,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; VI-LABEL: s_fneg_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 @@ -78,7 +78,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; GFX11-LABEL: s_fneg_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 @@ -97,8 +97,8 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) { ; SI-LABEL: s_fneg_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -115,8 +115,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; ; VI-LABEL: s_fneg_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s7, 0x80000000 ; VI-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -134,8 +134,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; GFX11-LABEL: s_fneg_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -157,8 +157,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fsub0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -168,10 +168,10 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fsub0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 0, s2 +; VI-NEXT: v_sub_f32_e64 v2, 0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -180,11 +180,11 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fsub0_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2 +; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -198,8 +198,8 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,10 +210,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -223,10 +223,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fneg_free_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,8 +242,8 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fold_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fold_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, -s2, s2 +; VI-NEXT: v_mul_f32_e64 v2, -s4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -265,11 +265,11 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: fneg_fold_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2 +; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -284,8 +284,8 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,10 +295,10 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; ; VI-LABEL: bitpreserve_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0 +; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -307,11 +307,11 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; GFX11-LABEL: bitpreserve_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -327,8 +327,8 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -339,10 +339,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -352,10 +352,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -380,8 +380,8 @@ define i32 @v_fneg_i32(i32 %in) { define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -391,10 +391,10 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2 +; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -403,11 +403,11 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2 +; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -434,7 +434,7 @@ define float @v_fneg_i32_fp_use(i32 %in) { define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 @@ -460,7 +460,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -488,7 +488,7 @@ define i64 @v_fneg_i64(i64 %in) { define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +500,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64_fp_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 @@ -550,23 +550,24 @@ define i16 @v_fneg_i16(i16 %in) { define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; SI-LABEL: s_fneg_i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2 +; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -575,11 +576,11 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fneg_i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2 +; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -619,8 +620,8 @@ define half @v_fneg_i16_fp_use(i16 %in) { define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -631,15 +632,15 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; ; VI-LABEL: s_fneg_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_xor_b32 s3, s4, 0x8000 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -649,10 +650,10 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; GFX11-LABEL: s_fneg_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -695,34 +696,35 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) { define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s3, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_v2i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_lshr_b32 s2, s4, 16 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 +; VI-NEXT: s_xor_b32 s3, s4, 0x8000 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_add_f16_e64 v1, s3, 2.0 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -733,11 +735,11 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; GFX11-LABEL: s_fneg_v2i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 9f339af0f5580..1914b74be1909 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -16,18 +16,24 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret void @@ -36,22 +42,24 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 -; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE +; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret void @@ -65,8 +73,11 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn: @@ -76,8 +87,11 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret @@ -91,10 +105,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn: @@ -104,10 +119,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret <2 x i16> %ret @@ -116,7 +132,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -125,7 +141,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -164,7 +180,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -173,7 +189,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -212,7 +228,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -222,7 +238,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index f55e9f4821b47..988b23d0fe1cc 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -14,17 +14,17 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -37,7 +37,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -49,7 +49,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -58,7 +58,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -76,7 +77,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -88,7 +89,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -97,7 +98,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -160,8 +162,9 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -180,17 +183,17 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -225,17 +228,17 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -270,17 +273,17 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3] +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -316,19 +319,23 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret void @@ -349,8 +356,11 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret @@ -359,24 +369,23 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: ds_pk_add_bf16 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret void @@ -386,10 +395,8 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn: @@ -399,10 +406,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret <2 x i16> %ret diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 18d2e52e8f900..fb731cc00d3f0 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -9,24 +9,24 @@ declare double @llvm.fabs.f64(double) #1 define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isinf_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x204 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isinf_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -36,11 +36,11 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f ; GFX11-LABEL: test_isinf_pattern: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -57,24 +57,24 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s0|, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cmp_nlg_f32_e64 s[4:5], |s4|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_not_isinf_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 +; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s4|, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -84,11 +84,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; GFX11-LABEL: test_not_isinf_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2| +; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s4| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -105,7 +105,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -115,7 +115,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; VI-LABEL: test_not_isinf_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; GFX11-LABEL: test_not_isinf_pattern_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -142,24 +142,24 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -169,11 +169,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -192,24 +192,24 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -219,11 +219,11 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,8 +241,8 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -266,11 +266,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -290,23 +290,23 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s6, s6 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 @@ -321,14 +321,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 -; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -346,7 +346,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocapture %out, float %x, float %y) #0 { ; SI-LABEL: test_isfinite_not_pattern_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 @@ -376,7 +376,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; GFX11-LABEL: test_isfinite_not_pattern_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 @@ -401,23 +401,23 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_u_f32_e64 s[0:1], s2, s2 -; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s2|, v0 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cmp_u_f32_e64 s[4:5], s6, s6 +; SI-NEXT: v_cmp_neq_f32_e64 s[6:7], |s6|, v0 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4 @@ -432,14 +432,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2 -; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| +; GFX11-NEXT: v_cmp_u_f32_e64 s2, s4, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s4| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -457,24 +457,24 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -484,11 +484,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -507,24 +507,24 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_commute_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4_commute_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -534,11 +534,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; GFX11-LABEL: test_isfinite_pattern_4_commute_and: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,16 +557,16 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[2:3], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_cmp_o_f32_e32 vcc, s0, v1 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_cmp_o_f32_e32 vcc, s1, v1 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -574,14 +574,14 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; ; VI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x50 -; VI-NEXT: s_load_dword s5, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x50 +; VI-NEXT: s_load_dword s1, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_cmp_class_f32_e32 vcc, s5, v0 -; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s5, v1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] @@ -592,15 +592,15 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x50 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s3, s4, 0x1f8 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -618,8 +618,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -632,11 +632,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; ; VI-LABEL: test_isinf_pattern_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -646,11 +646,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; GFX11-LABEL: test_isinf_pattern_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -667,8 +667,8 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -684,11 +684,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -698,11 +698,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -721,8 +721,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -738,11 +738,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_4_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -752,11 +752,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_4_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index a6fadbed33c86..742aeb96fcc2c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -20,8 +20,8 @@ declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -30,8 +30,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,19 +41,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -63,8 +63,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -75,8 +75,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen @@ -86,8 +86,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -96,8 +96,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -107,19 +107,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -129,8 +129,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -242,14 +242,15 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -257,60 +258,48 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s10 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; GFX1030-NEXT: v_mov_b32_e32 v1, s3 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s0 +; GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_clause 0x2 -; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s0 +; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s6 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm @@ -318,8 +307,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -331,14 +320,15 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s0 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf +; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -346,14 +336,15 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -362,12 +353,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -377,14 +368,15 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c +; G_GFX1030-NEXT: s_clause 0x1 +; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -392,13 +384,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x2 -; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; G_GFX1100-NEXT: s_clause 0x1 +; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -412,8 +405,8 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -422,8 +415,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -433,19 +426,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -455,8 +448,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -467,8 +460,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen @@ -478,8 +471,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -488,8 +481,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -499,19 +492,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -521,8 +514,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -634,7 +627,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -649,7 +642,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -664,7 +657,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 @@ -676,7 +669,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -688,7 +681,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -701,7 +694,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -715,7 +708,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -729,7 +722,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -743,7 +736,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 @@ -755,7 +748,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -767,7 +760,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll index ea26ad06cbb16..950d228f29929 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -18,8 +18,8 @@ declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,8 +28,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -39,19 +39,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -61,8 +61,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -72,8 +72,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -82,8 +82,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -93,19 +93,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -115,8 +115,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -219,14 +219,15 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -234,74 +235,63 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s10 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; GFX1030-NEXT: v_mov_b32_e32 v1, s3 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s0 +; GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_clause 0x2 -; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s0 +; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s6 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s0 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf +; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -309,14 +299,15 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -325,12 +316,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -340,14 +331,15 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c +; G_GFX1030-NEXT: s_clause 0x1 +; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -355,13 +347,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x2 -; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; G_GFX1100-NEXT: s_clause 0x1 +; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -376,8 +369,8 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -386,8 +379,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -397,19 +390,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -419,8 +412,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -430,8 +423,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -440,8 +433,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -451,19 +444,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -473,8 +466,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -577,7 +570,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -592,7 +585,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -607,7 +600,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 @@ -619,7 +612,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -631,7 +624,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -644,7 +637,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -658,7 +651,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -672,7 +665,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 @@ -684,7 +677,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -696,7 +689,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index d827ea0503a3b..81859dce04889 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -10,7 +10,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index 03b8251ea4640..c17be87834aeb 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 8ab82b722445e..d8a726f251a01 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -9,7 +9,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp32_to_fp16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp32_to_fp16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -45,7 +45,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp32_to_fp16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll index 47164c9d32a8f..90c1759070a59 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,7 +506,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll index 3321948192bda..ff6700d10ff53 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,7 +506,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 04ef30bd26aa5..3571f3545ad1a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fp_to_sint_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,8 +47,8 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32_fabs: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -58,12 +58,12 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_i32_fabs: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2| ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_i32_f32_e64 v0, |s4| ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -87,7 +87,7 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_sint_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_sint_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: fp_to_sint_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -193,37 +193,37 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0x2f800000 -; SI-NEXT: s_mov_b32 s2, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s5, 0x2f800000 +; SI-NEXT: s_mov_b32 s6, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s0 -; SI-NEXT: v_mul_f32_e64 v1, |v0|, s1 +; SI-NEXT: v_trunc_f32_e32 v0, s4 +; SI-NEXT: v_mul_f32_e64 v1, |v0|, s5 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; SI-NEXT: v_floor_f32_e32 v1, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v1 -; SI-NEXT: v_fma_f32 v0, v1, s2, |v0| +; SI-NEXT: v_fma_f32 v0, v1, s6, |v0| ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v2 ; SI-NEXT: v_xor_b32_e32 v0, v0, v2 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s4, 0x2f800000 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s2, 0x2f800000 ; VI-NEXT: s_mov_b32 s5, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s2 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4 +; VI-NEXT: v_trunc_f32_e32 v0, s4 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s2 ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s5, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -294,7 +294,7 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 @@ -329,7 +329,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -452,17 +452,17 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_sint_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 ; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s1 -; SI-NEXT: v_trunc_f32_e32 v1, s0 -; SI-NEXT: v_trunc_f32_e32 v2, s3 -; SI-NEXT: v_trunc_f32_e32 v3, s2 +; SI-NEXT: v_trunc_f32_e32 v0, s5 +; SI-NEXT: v_trunc_f32_e32 v1, s4 +; SI-NEXT: v_trunc_f32_e32 v2, s7 +; SI-NEXT: v_trunc_f32_e32 v3, s6 ; SI-NEXT: v_mul_f32_e64 v4, |v0|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; SI-NEXT: v_mul_f32_e64 v6, |v1|, s8 @@ -503,14 +503,14 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % ; SI-NEXT: v_subb_u32_e32 v7, vcc, v12, v9, vcc ; SI-NEXT: v_sub_i32_e32 v4, vcc, v13, v11 ; SI-NEXT: v_subb_u32_e32 v5, vcc, v8, v11, vcc -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s9, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -737,8 +737,8 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -749,8 +749,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -787,8 +787,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -799,8 +799,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -838,8 +838,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_sint_f32_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,12 +849,12 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_f32_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index 5abf82aa1aab5..c6b4e129bacbe 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_uint_f32_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % ; ; VI-LABEL: fp_to_uint_f32_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,7 +47,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -60,7 +60,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +92,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -152,34 +152,34 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x) { ; SI-LABEL: fp_to_uint_f32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s5, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s0 +; SI-NEXT: v_trunc_f32_e32 v0, s4 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_floor_f32_e32 v2, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v2 -; SI-NEXT: v_fma_f32 v0, v2, s1, v0 +; SI-NEXT: v_fma_f32 v0, v2, s5, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xcf800000 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s2, 0xcf800000 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s4 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; VI-NEXT: v_floor_f32_e32 v2, v1 -; VI-NEXT: v_fma_f32 v0, v2, s3, v0 +; VI-NEXT: v_fma_f32 v0, v2, s2, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -240,7 +240,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 @@ -264,7 +264,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -376,16 +376,16 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s1 -; SI-NEXT: v_trunc_f32_e32 v2, s0 -; SI-NEXT: v_trunc_f32_e32 v4, s3 -; SI-NEXT: v_trunc_f32_e32 v6, s2 +; SI-NEXT: v_trunc_f32_e32 v0, s5 +; SI-NEXT: v_trunc_f32_e32 v2, s4 +; SI-NEXT: v_trunc_f32_e32 v4, s7 +; SI-NEXT: v_trunc_f32_e32 v6, s6 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 @@ -406,14 +406,14 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x ; SI-NEXT: v_cvt_u32_f32_e32 v0, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v4, v9 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -619,8 +619,8 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -631,8 +631,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -669,8 +669,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -681,8 +681,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,8 +720,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,12 +731,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i ; ; VI-LABEL: fp_to_uint_f32_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 3c4087fe391b6..4f230140f7ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -13,51 +13,49 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind rea define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_not_b32 s5, s8 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_lshr_b32 s5, s4, 1 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; SI-NEXT: s_not_b32 s4, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_alignbit_b32 v0, s5, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: s_lshr_b32 s1, s6, 1 -; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_not_b32 s3, s6 +; VI-NEXT: s_lshr_b32 s2, s4, 1 +; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_not_b32 s1, s2 -; GFX9-NEXT: s_lshr_b32 s0, s6, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_not_b32 s3, s6 +; GFX9-NEXT: s_lshr_b32 s2, s4, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: @@ -77,30 +75,30 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshl_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, 1 -; GFX10-NEXT: s_lshr_b32 s0, s6, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, 1 +; GFX10-NEXT: s_lshr_b32 s2, s4, 1 +; GFX10-NEXT: s_not_b32 s3, s6 +; GFX10-NEXT: v_alignbit_b32 v0, s2, v0, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, 1 -; GFX11-NEXT: s_lshr_b32 s1, s6, 1 -; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, 1 +; GFX11-NEXT: s_lshr_b32 s2, s4, 1 +; GFX11-NEXT: s_not_b32 s3, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s1, v0, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_alignbit_b32 v0, s2, v0, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -113,7 +111,7 @@ entry: define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshl_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,7 +124,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 @@ -137,7 +135,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -159,7 +157,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 25 @@ -168,7 +166,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -185,15 +183,15 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 ; SI-NEXT: s_not_b32 s1, s1 +; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 ; SI-NEXT: s_lshr_b32 s2, s5, 1 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 @@ -208,47 +206,47 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s3, s3 +; VI-NEXT: s_not_b32 s1, s1 ; VI-NEXT: s_lshr_b32 s7, s5, 1 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_not_b32 s0, s0 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s3, s4, 1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshr_b32 s1, s4, 1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s0, s5, 1 -; GFX9-NEXT: s_not_b32 s1, s9 +; GFX9-NEXT: s_lshr_b32 s2, s5, 1 +; GFX9-NEXT: s_not_b32 s3, s9 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s1, s8 +; GFX9-NEXT: s_not_b32 s3, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -272,39 +270,39 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX10-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX10-NEXT: s_lshr_b32 s0, s5, 1 -; GFX10-NEXT: s_not_b32 s1, s3 +; GFX10-NEXT: s_lshr_b32 s2, s5, 1 +; GFX10-NEXT: s_not_b32 s1, s1 ; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s2 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: v_alignbit_b32 v1, s2, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1 ; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s3, s3 +; GFX11-NEXT: s_not_b32 s1, s1 ; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s2, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s1 +; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -317,8 +315,8 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -331,8 +329,8 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -345,15 +343,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -373,20 +371,20 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 23 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 25 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 @@ -404,44 +402,44 @@ entry: define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_not_b32 s1, s19 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: s_not_b32 s11, s15 ; SI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; SI-NEXT: s_lshr_b32 s7, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1 +; SI-NEXT: s_lshr_b32 s0, s7, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_not_b32 s7, s14 +; SI-NEXT: s_not_b32 s1, s18 ; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s6, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 +; SI-NEXT: s_lshr_b32 s0, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_not_b32 s6, s13 +; SI-NEXT: s_not_b32 s1, s17 ; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_lshr_b32 s5, s5, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 +; SI-NEXT: s_lshr_b32 s0, s5, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_not_b32 s5, s12 +; SI-NEXT: s_not_b32 s1, s16 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s4, 1 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_lshr_b32 s0, s4, 1 +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_not_b32 s3, s15 @@ -474,36 +472,36 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_not_b32 s1, s15 +; GFX9-NEXT: s_not_b32 s3, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_lshr_b32 s0, s7, 1 +; GFX9-NEXT: s_lshr_b32 s2, s7, 1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v3, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s1, s14 +; GFX9-NEXT: s_not_b32 s3, s14 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; GFX9-NEXT: s_lshr_b32 s2, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v2, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s1, s13 +; GFX9-NEXT: s_not_b32 s3, s13 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s5, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; GFX9-NEXT: s_lshr_b32 s2, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s1, s12 +; GFX9-NEXT: s_not_b32 s3, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -534,11 +532,11 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX10-LABEL: fshl_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 @@ -562,9 +560,9 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-LABEL: fshl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1 @@ -596,10 +594,10 @@ entry: define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshl_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -609,13 +607,13 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 25 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -632,9 +630,9 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -668,22 +666,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshl_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31 @@ -704,7 +702,7 @@ entry: define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: orxor2or1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,7 +718,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; VI-LABEL: orxor2or1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 @@ -734,7 +732,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s6, 7 @@ -761,7 +759,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s0, s6, 7 @@ -774,7 +772,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX11-LABEL: orxor2or1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index e8377763e4be2..31f574d44ab8c 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -22,42 +22,40 @@ declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_alignbit_b32 v0, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: @@ -74,24 +72,24 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshr_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -105,7 +103,7 @@ entry: define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshr_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -118,7 +116,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 @@ -129,7 +127,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -151,7 +149,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 7 @@ -160,7 +158,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -177,9 +175,9 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -194,33 +192,33 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm @@ -242,13 +240,13 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] @@ -257,16 +255,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-LABEL: fshr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -279,8 +277,8 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -293,8 +291,8 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -307,15 +305,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -335,20 +333,20 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 @@ -366,11 +364,11 @@ entry: define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s19, 0xf000 +; SI-NEXT: s_mov_b32 s18, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s15 @@ -384,14 +382,14 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v4, s12 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s15 @@ -412,10 +410,10 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 @@ -453,9 +451,9 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s15 @@ -466,15 +464,15 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 @@ -498,10 +496,10 @@ entry: define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshr_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -511,13 +509,13 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -534,9 +532,9 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -568,22 +566,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshr_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index d4398e5367c7f..fe6d467bb6281 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,8 +19,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_add_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,12 +32,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_add_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -50,8 +50,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -65,14 +65,14 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_add_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s3, -1 +; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s1, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -80,12 +80,12 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_add_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:-4096 +; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -98,8 +98,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -112,8 +112,8 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_add_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -126,12 +126,12 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_add_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:3232 +; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -144,29 +144,29 @@ entry: define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xdeac -; VI-NEXT: s_addc_u32 s1, s3, 0xabcd +; VI-NEXT: s_add_u32 s0, s0, 0xdeac +; VI-NEXT: s_addc_u32 s1, s1, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -174,12 +174,12 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_add_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -195,8 +195,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -213,29 +213,29 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -251,9 +251,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -268,18 +268,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -287,12 +287,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -310,9 +310,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -332,22 +332,22 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -356,12 +356,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -381,8 +381,8 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -394,8 +394,8 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_add_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,12 +407,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_add_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -424,8 +424,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -442,8 +442,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_add_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,11 +460,11 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_add_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -496,16 +496,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_add_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -513,12 +513,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_add_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -535,9 +535,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -557,20 +557,20 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -579,12 +579,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -616,8 +616,8 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_and_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,12 +629,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_and_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -647,8 +647,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -665,29 +665,29 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -703,9 +703,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -720,18 +720,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -739,12 +739,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -762,9 +762,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -784,22 +784,22 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -808,12 +808,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -833,8 +833,8 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -846,8 +846,8 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_and_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,12 +859,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_and_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -876,8 +876,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -894,8 +894,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_and_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -912,11 +912,11 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_and_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -931,9 +931,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -948,16 +948,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_and_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -965,12 +965,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_and_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -987,9 +987,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1009,20 +1009,20 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1031,12 +1031,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1055,8 +1055,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_sub_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1081,12 +1081,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_sub_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1099,8 +1099,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,29 +1117,29 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1155,9 +1155,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1172,18 +1172,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1191,12 +1191,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_sub_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1214,9 +1214,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1236,22 +1236,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1260,12 +1260,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1285,8 +1285,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1298,8 +1298,8 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_sub_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1311,12 +1311,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_sub_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1328,8 +1328,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1346,8 +1346,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_sub_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,11 +1364,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1383,9 +1383,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1400,16 +1400,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_sub_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1417,12 +1417,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_sub_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1439,9 +1439,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1461,20 +1461,20 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1483,12 +1483,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1507,8 +1507,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,8 +1520,8 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_max_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1533,12 +1533,12 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_max_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1551,8 +1551,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1568,28 +1568,28 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1604,9 +1604,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1619,29 +1619,29 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1657,9 +1657,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1678,20 +1678,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1701,12 +1701,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1725,8 +1725,8 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1736,8 +1736,8 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_max_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,12 +1747,12 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_max_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -1762,8 +1762,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1779,8 +1779,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_max_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1796,11 +1796,11 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_max_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1814,9 +1814,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1829,27 +1829,27 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1864,9 +1864,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1885,18 +1885,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1906,12 +1906,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1929,8 +1929,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1940,8 +1940,8 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1951,12 +1951,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umax_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -1967,8 +1967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umax_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1984,28 +1984,28 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2020,9 +2020,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2035,29 +2035,29 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2073,9 +2073,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2094,20 +2094,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2117,12 +2117,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2141,8 +2141,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,8 +2152,8 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umax_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2163,12 +2163,12 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umax_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2178,8 +2178,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umax_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2195,8 +2195,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umax_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2212,11 +2212,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umax_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2230,9 +2230,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2245,27 +2245,27 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2280,9 +2280,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2301,18 +2301,18 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2322,12 +2322,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2345,8 +2345,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2356,8 +2356,8 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_min_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2367,12 +2367,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_min_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2383,8 +2383,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2400,28 +2400,28 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2436,9 +2436,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2451,29 +2451,29 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2489,9 +2489,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2510,20 +2510,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2533,12 +2533,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2557,8 +2557,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2568,8 +2568,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,12 +2579,12 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2594,8 +2594,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2611,8 +2611,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_min_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2628,11 +2628,11 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_min_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2646,9 +2646,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2661,27 +2661,27 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_min_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2696,9 +2696,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2717,18 +2717,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2738,12 +2738,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2761,8 +2761,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2772,8 +2772,8 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2783,12 +2783,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umin_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2799,8 +2799,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2816,28 +2816,28 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2852,9 +2852,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2867,29 +2867,29 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2905,9 +2905,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2926,20 +2926,20 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2949,12 +2949,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2973,8 +2973,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,8 +2984,8 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umin_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,12 +2995,12 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umin_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3010,8 +3010,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3027,8 +3027,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umin_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3044,11 +3044,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umin_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -3062,9 +3062,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3077,27 +3077,27 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3112,9 +3112,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3133,18 +3133,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3154,12 +3154,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3177,8 +3177,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3190,8 +3190,8 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_or_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3203,12 +3203,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_or_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3221,8 +3221,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3239,29 +3239,29 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3277,9 +3277,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3294,18 +3294,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; VI-LABEL: atomic_or_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3313,12 +3313,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; GFX9-LABEL: atomic_or_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3336,9 +3336,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3358,22 +3358,22 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3382,12 +3382,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3407,8 +3407,8 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3420,8 +3420,8 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_or_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3433,12 +3433,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_or_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3450,8 +3450,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3468,8 +3468,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: atomic_or_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3486,11 +3486,11 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: atomic_or_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3505,9 +3505,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3522,16 +3522,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_or_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3539,12 +3539,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_or_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3561,9 +3561,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3583,20 +3583,20 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3605,12 +3605,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3629,8 +3629,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3642,8 +3642,8 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3655,12 +3655,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3673,8 +3673,8 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) { ; SI-LABEL: atomic_xchg_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3686,8 +3686,8 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; VI-LABEL: atomic_xchg_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,12 +3699,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; GFX9-LABEL: atomic_xchg_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3717,8 +3717,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3735,29 +3735,29 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3773,9 +3773,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3790,18 +3790,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3809,12 +3809,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3832,9 +3832,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3854,22 +3854,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3878,12 +3878,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3903,8 +3903,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3916,8 +3916,8 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,12 +3929,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3946,8 +3946,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3964,8 +3964,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_xchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3982,11 +3982,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_xchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4001,9 +4001,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4018,16 +4018,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4035,12 +4035,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4057,9 +4057,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4079,20 +4079,20 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4101,12 +4101,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4125,7 +4125,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4140,7 +4140,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4155,7 +4155,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -4173,8 +4173,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4192,31 +4192,31 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4233,10 +4233,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s7, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4252,19 +4252,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s7, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dword s6, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4273,17 +4273,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4298,10 +4298,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s10, s[0:1], 0x11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s10, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4309,8 +4309,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 glc @@ -4322,24 +4322,24 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s9, s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4348,13 +4348,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4376,7 +4376,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4391,7 +4391,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; VI-LABEL: atomic_cmpxchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4406,7 +4406,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -4423,8 +4423,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4442,8 +4442,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4461,12 +4461,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4482,10 +4482,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s7, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4501,17 +4501,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s7, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dword s6, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4520,17 +4520,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4544,10 +4544,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s10, s[0:1], 0x11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s10, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4555,8 +4555,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 glc @@ -4568,22 +4568,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s9, s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4592,13 +4592,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4619,8 +4619,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4632,8 +4632,8 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_xor_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4645,12 +4645,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_xor_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4663,8 +4663,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4681,29 +4681,29 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4719,9 +4719,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4736,18 +4736,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4755,12 +4755,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4778,9 +4778,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4800,22 +4800,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4824,12 +4824,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4849,8 +4849,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4862,8 +4862,8 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xor_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,12 +4875,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xor_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4892,8 +4892,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4910,8 +4910,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_xor_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4928,11 +4928,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4947,9 +4947,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4964,16 +4964,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_xor_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4981,12 +4981,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_xor_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -5003,9 +5003,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5025,20 +5025,20 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -5047,12 +5047,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -5071,7 +5071,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5087,7 +5087,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5105,7 +5105,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5123,7 +5123,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5141,7 +5141,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i32_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5159,7 +5159,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i32_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc @@ -5177,7 +5177,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,7 +5193,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5229,7 +5229,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5245,7 +5245,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5261,7 +5261,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5278,8 +5278,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5298,8 +5298,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5320,11 +5320,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5343,8 +5343,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5363,8 +5363,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,11 +5383,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5405,8 +5405,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5425,8 +5425,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5447,11 +5447,11 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5470,8 +5470,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5481,25 +5481,25 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -5510,8 +5510,8 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5521,23 +5521,23 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5547,8 +5547,8 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5558,23 +5558,23 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic float %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5584,8 +5584,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5598,8 +5598,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; VI-LABEL: atomic_store_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5614,14 +5614,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; GFX9-LABEL: atomic_store_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5634,8 +5634,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5648,8 +5648,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; VI-LABEL: atomic_store_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5664,14 +5664,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; GFX9-LABEL: atomic_store_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5684,8 +5684,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s8, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s8, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5699,8 +5699,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5713,14 +5713,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5732,8 +5732,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s8, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s8, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5747,8 +5747,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; VI-LABEL: atomic_store_f32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5761,14 +5761,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; GFX9-LABEL: atomic_store_f32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5780,7 +5780,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5796,7 +5796,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; VI-LABEL: atomic_load_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5812,7 +5812,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; GFX9-LABEL: atomic_load_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc @@ -5830,7 +5830,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5848,7 +5848,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; VI-LABEL: atomic_load_i8_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5866,7 +5866,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; GFX9-LABEL: atomic_load_i8_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc @@ -5884,8 +5884,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5895,25 +5895,25 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %out, i64 16 @@ -5924,8 +5924,8 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5935,23 +5935,23 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1 @@ -5961,7 +5961,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5977,7 +5977,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5993,7 +5993,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -6011,7 +6011,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -6029,7 +6029,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i16_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6047,7 +6047,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i16_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -6065,8 +6065,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6076,25 +6076,25 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %out, i64 8 @@ -6105,8 +6105,8 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6116,23 +6116,23 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6142,8 +6142,8 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6153,25 +6153,25 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o ; ; VI-LABEL: atomic_store_f16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr half, ptr addrspace(1) %out, i64 8 @@ -6182,8 +6182,8 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6193,23 +6193,23 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic half %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6219,8 +6219,8 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6230,25 +6230,25 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) ; ; VI-LABEL: atomic_store_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8 store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -6258,8 +6258,8 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6269,23 +6269,23 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -6294,8 +6294,8 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,8 +6307,8 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_inc_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6320,12 +6320,12 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_inc_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6338,8 +6338,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6353,14 +6353,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s3, -1 +; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s1, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6368,12 +6368,12 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-4096 +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6386,8 +6386,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6400,8 +6400,8 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_inc_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6414,12 +6414,12 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_inc_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:3232 +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6432,29 +6432,29 @@ entry: define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xdeac -; VI-NEXT: s_addc_u32 s1, s3, 0xabcd +; VI-NEXT: s_add_u32 s0, s0, 0xdeac +; VI-NEXT: s_addc_u32 s1, s1, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6462,12 +6462,12 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_inc_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6482,8 +6482,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_inc_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6500,29 +6500,29 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_inc_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6538,9 +6538,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6555,18 +6555,18 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6574,12 +6574,12 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6597,9 +6597,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6619,22 +6619,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_inc_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -6643,12 +6643,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -6668,8 +6668,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6681,8 +6681,8 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_dec_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6694,12 +6694,12 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_dec_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6712,8 +6712,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6727,14 +6727,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s3, -1 +; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s1, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6742,12 +6742,12 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_dec_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:-4096 +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6760,8 +6760,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6774,8 +6774,8 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_dec_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6788,12 +6788,12 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_dec_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:3232 +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6806,29 +6806,29 @@ entry: define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xdeac -; VI-NEXT: s_addc_u32 s1, s3, 0xabcd +; VI-NEXT: s_add_u32 s0, s0, 0xdeac +; VI-NEXT: s_addc_u32 s1, s1, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6836,12 +6836,12 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_dec_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6856,8 +6856,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_dec_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6874,29 +6874,29 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6912,9 +6912,9 @@ entry: define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6929,18 +6929,18 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6948,12 +6948,12 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6971,9 +6971,9 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6993,22 +6993,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_dec_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -7017,12 +7017,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -7042,7 +7042,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7058,7 +7058,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,7 +7074,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -7091,7 +7091,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7109,7 +7109,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_f16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7127,7 +7127,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_f16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -7144,7 +7144,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7160,7 +7160,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; VI-LABEL: atomic_load_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7176,7 +7176,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; GFX9-LABEL: atomic_load_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -7193,7 +7193,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7211,7 +7211,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_bf16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7229,7 +7229,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_bf16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 1fa7c52a68802..c7fa2a2ede388 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4616,7 +4616,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4648,7 +4648,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4679,7 +4679,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4714,8 +4714,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4753,8 +4753,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4789,24 +4789,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc +; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -4829,7 +4829,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4861,7 +4861,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4890,7 +4890,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4924,8 +4924,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4963,8 +4963,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4997,24 +4997,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -5869,7 +5869,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -5901,7 +5901,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -5932,7 +5932,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5967,8 +5967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6006,8 +6006,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6042,24 +6042,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc +; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -6082,8 +6082,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6121,8 +6121,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6155,24 +6155,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -7860,7 +7860,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -7892,7 +7892,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -7923,7 +7923,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7958,8 +7958,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -7997,8 +7997,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8033,24 +8033,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc +; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -8073,36 +8073,36 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s3, s[4:5], 0x0 -; SI-NEXT: s_mov_b64 s[0:1], 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s2, v1 +; SI-NEXT: v_min_i32_e32 v0, s6, v1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -8126,24 +8126,24 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8155,8 +8155,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -8194,8 +8194,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8228,24 +8228,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index e2d55990473c0..3735c6349fbb3 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: load_f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -19,8 +19,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; ; VI-LABEL: load_f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -31,10 +31,10 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; GFX11-LABEL: load_f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -46,8 +46,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: load_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -57,8 +57,8 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; ; VI-LABEL: load_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -69,10 +69,10 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; GFX11-LABEL: load_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -84,7 +84,7 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; ; GFX11-LABEL: load_v3f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -130,7 +130,7 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -145,12 +145,12 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: load_v8f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_mov_b32_e32 v5, s5 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -159,12 +159,12 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; ; VI-LABEL: load_v8f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -174,8 +174,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-LABEL: load_v8f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -191,8 +191,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: extload_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -204,8 +204,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; ; VI-LABEL: extload_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -218,13 +218,13 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; GFX11-LABEL: extload_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -237,8 +237,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -248,8 +248,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -260,11 +260,11 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; GFX11-LABEL: extload_f16_to_f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -277,8 +277,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -290,8 +290,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -304,13 +304,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; GFX11-LABEL: extload_v2f16_to_v2f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -323,7 +323,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -336,7 +336,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -349,7 +349,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -368,7 +368,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -383,7 +383,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 @@ -419,8 +419,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -447,8 +447,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -476,8 +476,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; GFX11-LABEL: extload_v8f16_to_v8f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s8, s7, 16 @@ -506,10 +506,10 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -519,10 +519,10 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x8 +; VI-NEXT: s_load_dword s0, s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -532,14 +532,14 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; GFX11-LABEL: extload_f16_to_f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -552,12 +552,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -568,12 +568,12 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x8 +; VI-NEXT: s_load_dword s0, s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -584,17 +584,17 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; GFX11-LABEL: extload_v2f16_to_v2f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 +; GFX11-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -607,7 +607,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -628,7 +628,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -649,7 +649,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -675,7 +675,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -725,7 +725,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s5, s3, 16 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -754,8 +754,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -801,8 +801,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -848,22 +848,20 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; GFX11-LABEL: extload_v8f16_to_v8f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s9, s7, 16 ; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-NEXT: s_lshr_b32 s1, s5, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s0, s4, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 @@ -872,7 +870,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 @@ -889,7 +889,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -902,7 +902,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: global_load_store_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -932,7 +932,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -949,7 +949,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -962,7 +962,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; ; GFX11-LABEL: global_load_store_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -979,7 +979,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -992,7 +992,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1042,7 +1042,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v2, s[2:3] @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3] @@ -1151,7 +1151,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -1358,7 +1358,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v20, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1493,7 +1493,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v4, s[2:3] @@ -1553,7 +1553,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1602,7 +1602,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] @@ -1718,7 +1718,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1766,7 +1766,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1810,7 +1810,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,7 +1947,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2033,7 +2033,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2102,7 +2102,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2116,7 +2116,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; ; GFX11-LABEL: global_truncstore_f32_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2135,7 +2135,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2152,7 +2152,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2168,7 +2168,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v2f32_to_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -2190,7 +2190,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2213,7 +2213,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v3f32_to_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3] @@ -2260,7 +2260,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2300,7 +2300,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v4f32_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2325,7 +2325,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2360,7 +2360,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2391,7 +2391,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v8f32_to_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2425,7 +2425,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2554,7 +2554,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; GFX11-LABEL: global_truncstore_v16f32_to_v16f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -2606,12 +2606,12 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 { ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2622,8 +2622,8 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2636,13 +2636,13 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; GFX11-LABEL: fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_f16_e64 v1, s2, s3 +; GFX11-NEXT: v_add_f16_e64 v1, s4, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2655,7 +2655,7 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 { ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2676,7 +2676,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2693,7 +2693,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 @@ -2709,7 +2709,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: fadd_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2787,8 +2787,8 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2845,8 +2845,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s15, 16 ; VI-NEXT: s_lshr_b32 s3, s11, 16 @@ -2888,8 +2888,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; GFX11-LABEL: fadd_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, s7, s11 @@ -2908,7 +2908,7 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2921,7 +2921,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: test_bitcast_from_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2939,7 +2939,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2952,7 +2952,7 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_bitcast_to_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index f736ca7cd625a..b62bf890e65fe 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -4,22 +4,22 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { ; GCN-LABEL: float4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 ; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_inselt_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v2, v0 @@ -56,23 +56,23 @@ entry: define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) { ; GCN-LABEL: int4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 3 -; GCN-NEXT: s_cselect_b32 s3, s7, 1 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 -; GCN-NEXT: s_cselect_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cselect_b32 s2, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cselect_b32 s3, s6, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, s4, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, 1 ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -85,15 +85,15 @@ entry: define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) { ; GCN-LABEL: float2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: s_cmp_lg_u32 s6, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc @@ -109,21 +109,21 @@ entry: define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) { ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GCN-NEXT: s_load_dword s12, s[2:3], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 ; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: s_mov_b32 m0, s12 ; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v8, s2 @@ -142,14 +142,14 @@ entry: define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) { ; GCN-LABEL: float16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s20, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_add_u32 s0, s2, 48 -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -166,24 +166,24 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v14, s18 ; GCN-NEXT: v_mov_b32_e32 v15, s19 ; GCN-NEXT: s_mov_b32 m0, s20 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: s_add_u32 s0, s2, 32 +; GCN-NEXT: v_mov_b32_e32 v16, s2 +; GCN-NEXT: s_add_u32 s2, s0, 32 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s1 -; GCN-NEXT: v_mov_b32_e32 v12, s0 -; GCN-NEXT: s_add_u32 s0, s2, 16 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s3 +; GCN-NEXT: v_mov_b32_e32 v12, s2 +; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -195,18 +195,18 @@ entry: define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) { ; GCN-LABEL: float32_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x124 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_add_u32 s0, s2, 0x70 -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v33, s1 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 ; GCN-NEXT: v_mov_b32_e32 v5, s41 @@ -236,48 +236,48 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: v_mov_b32_e32 v32, s0 -; GCN-NEXT: s_add_u32 s0, s2, 0x60 +; GCN-NEXT: v_mov_b32_e32 v32, s2 +; GCN-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s1 -; GCN-NEXT: v_mov_b32_e32 v28, s0 -; GCN-NEXT: s_add_u32 s0, s2, 0x50 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v29, s3 +; GCN-NEXT: v_mov_b32_e32 v28, s2 +; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s1 -; GCN-NEXT: v_mov_b32_e32 v24, s0 -; GCN-NEXT: s_add_u32 s0, s2, 64 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v25, s3 +; GCN-NEXT: v_mov_b32_e32 v24, s2 +; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s1 -; GCN-NEXT: v_mov_b32_e32 v20, s0 -; GCN-NEXT: s_add_u32 s0, s2, 48 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v21, s3 +; GCN-NEXT: v_mov_b32_e32 v20, s2 +; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s1 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: s_add_u32 s0, s2, 32 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s3 +; GCN-NEXT: v_mov_b32_e32 v16, s2 +; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s1 -; GCN-NEXT: v_mov_b32_e32 v12, s0 -; GCN-NEXT: s_add_u32 s0, s2, 16 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s3 +; GCN-NEXT: v_mov_b32_e32 v12, s2 +; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -289,8 +289,8 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -314,7 +314,7 @@ entry: define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) { ; GCN-LABEL: half2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -335,49 +335,49 @@ entry: define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) { ; GCN-LABEL: half8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s3, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 7 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_lshr_b32 s2, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 7 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_cmp_lg_u32 s8, 6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s6, 16 +; GCN-NEXT: s_lshr_b32 s2, s6, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cmp_lg_u32 s8, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s8, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-NEXT: s_lshr_b32 s2, s5, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s4, 16 +; GCN-NEXT: s_lshr_b32 s2, s4, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -397,7 +397,7 @@ entry: define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) { ; GCN-LABEL: short2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -418,8 +418,8 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s4, s4, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 @@ -467,99 +467,99 @@ entry: define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s3, s7, 24 -; GCN-NEXT: s_cmp_lg_u32 s2, 15 -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: s_lshr_b32 s2, s7, 24 +; GCN-NEXT: s_cmp_lg_u32 s8, 15 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 14 +; GCN-NEXT: s_lshr_b32 s2, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 14 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s7, 8 +; GCN-NEXT: s_lshr_b32 s2, s7, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 13 +; GCN-NEXT: s_cmp_lg_u32 s8, 13 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 12 +; GCN-NEXT: s_cmp_lg_u32 s8, 12 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s3, s6, 24 +; GCN-NEXT: s_lshr_b32 s2, s6, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 11 +; GCN-NEXT: s_cmp_lg_u32 s8, 11 ; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s6, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 10 +; GCN-NEXT: s_lshr_b32 s2, s6, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 10 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s6, 8 +; GCN-NEXT: s_lshr_b32 s2, s6, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 9 +; GCN-NEXT: s_cmp_lg_u32 s8, 9 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 8 +; GCN-NEXT: s_cmp_lg_u32 s8, 8 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s3, s5, 24 +; GCN-NEXT: s_lshr_b32 s2, s5, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: s_cmp_lg_u32 s8, 7 ; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s5, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_lshr_b32 s2, s5, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 6 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s5, 8 +; GCN-NEXT: s_lshr_b32 s2, s5, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cmp_lg_u32 s8, 5 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s8, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_lshr_b32 s3, s4, 24 +; GCN-NEXT: s_lshr_b32 s2, s4, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s4, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s4, 8 +; GCN-NEXT: s_lshr_b32 s2, s4, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -580,21 +580,21 @@ entry: define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) { ; GCN-LABEL: double2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, 1 -; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7 -; GCN-NEXT: s_cselect_b32 s6, 0, s6 -; GCN-NEXT: s_cmp_eq_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5 +; GCN-NEXT: s_cmp_eq_u32 s8, 1 +; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s7 +; GCN-NEXT: s_cselect_b32 s3, 0, s6 +; GCN-NEXT: s_cmp_eq_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5 ; GCN-NEXT: s_cselect_b32 s4, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -607,10 +607,10 @@ entry: define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 -; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 +; GCN-NEXT: s_load_dword s12, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x84 +; GCN-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 @@ -661,12 +661,12 @@ entry: define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) { ; GCN-LABEL: double8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s20, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 @@ -717,17 +717,17 @@ entry: define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) { ; GCN-LABEL: double7_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x94 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x84 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x94 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x84 +; GCN-NEXT: s_load_dword s2, s[2:3], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_lshl_b32 s0, s0, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 @@ -738,25 +738,25 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NEXT: v_mov_b32_e32 v12, s16 ; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v15, s1 -; GCN-NEXT: v_mov_b32_e32 v14, s0 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v15, s3 +; GCN-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-NEXT: s_add_u32 s0, s2, 48 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_add_u32 s0, s2, 32 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_add_u32 s0, s0, 32 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -770,14 +770,15 @@ entry: define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) { ; GCN-LABEL: double16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x124 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 1 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 @@ -809,7 +810,7 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -867,20 +868,22 @@ entry: define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) { ; GCN-LABEL: double15_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x114 -; GCN-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x104 -; GCN-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0xe4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x114 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[2:3], 0x104 +; GCN-NEXT: s_load_dwordx8 s[24:31], s[2:3], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_load_dword s4, s[0:1], 0x124 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v28, s2 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x124 +; GCN-NEXT: v_mov_b32_e32 v28, s0 +; GCN-NEXT: v_mov_b32_e32 v29, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s4, 1 +; GCN-NEXT: s_lshl_b32 s0, s4, 1 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 @@ -906,9 +909,8 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v25, s21 ; GCN-NEXT: v_mov_b32_e32 v26, s22 ; GCN-NEXT: v_mov_b32_e32 v27, s23 -; GCN-NEXT: v_mov_b32_e32 v29, s3 -; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -962,13 +964,13 @@ entry: define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xe80000 -; GCN-NEXT: s_add_u32 s4, s4, s3 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: s_mov_b32 s15, 0xe80000 +; GCN-NEXT: s_add_u32 s12, s12, s9 +; GCN-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 3 @@ -980,16 +982,16 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 -; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3 -; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2 -; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1 +; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 +; GCN-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:3 +; GCN-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:2 +; GCN-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen -; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1 -; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2 -; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3 +; GCN-NEXT: buffer_store_byte v1, v0, s[12:15], 0 offen +; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 +; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1 +; GCN-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 +; GCN-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:3 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) @@ -1017,11 +1019,11 @@ entry: define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit128_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s1, s4, 24 +; GCN-NEXT: s_lshr_b32 s3, s4, 24 ; GCN-NEXT: s_lshr_b32 s8, s4, 16 ; GCN-NEXT: s_lshr_b32 s9, s4, 17 ; GCN-NEXT: s_lshr_b32 s10, s4, 18 @@ -1057,10 +1059,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_lshr_b32 s41, s7, 21 ; GCN-NEXT: s_lshr_b32 s42, s7, 22 ; GCN-NEXT: s_lshr_b32 s43, s7, 23 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x77 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x77 ; GCN-NEXT: v_mov_b32_e32 v15, s43 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x76 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x76 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s42 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1068,11 +1070,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x75 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x75 ; GCN-NEXT: v_or_b32_e32 v15, v15, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s41 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x74 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x74 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s40 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1081,11 +1083,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x73 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x73 ; GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_mov_b32_e32 v18, s39 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x72 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x72 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s38 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1093,11 +1095,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x71 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x71 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s37 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x70 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x70 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s36 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1109,11 +1111,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7f +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7f ; GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7e +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s35 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1121,11 +1123,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7d +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7c +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1134,22 +1136,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7b +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7a +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x78 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x78 ; GCN-NEXT: v_mov_b32_e32 v13, s35 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x79 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x79 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s35 ; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc @@ -1164,11 +1166,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6f +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6f ; GCN-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6e +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 14, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1176,11 +1178,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6d +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 13, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6c +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 12, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1189,11 +1191,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6b +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6a +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 10, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1201,11 +1203,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x69 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 9, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x68 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1217,11 +1219,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x67 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x66 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1229,11 +1231,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x65 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x64 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1242,11 +1244,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x63 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x62 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1254,11 +1256,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x61 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x61 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x60 ; GCN-NEXT: v_mov_b32_e32 v16, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1271,11 +1273,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 4, v18 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 ; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x57 ; GCN-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v17, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x56 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s33 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1283,11 +1285,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x55 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s31 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x54 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s30 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1296,11 +1298,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x53 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_mov_b32_e32 v18, s29 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x52 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s28 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1308,11 +1310,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x51 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x50 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s26 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1324,11 +1326,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5f ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s25 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1336,11 +1338,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5d +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5c +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1349,22 +1351,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5b +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5a +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x58 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x58 ; GCN-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x59 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x59 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s25 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc @@ -1378,11 +1380,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4f +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4f ; GCN-NEXT: v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v3, 15, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4e +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4e ; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1390,11 +1392,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4d +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4d ; GCN-NEXT: v_or_b32_e32 v3, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4c +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4c ; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1403,11 +1405,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4b +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4b ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4a +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4a ; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1415,11 +1417,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x49 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x49 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 9, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x48 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x48 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 8, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1431,11 +1433,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 12, v3 ; GCN-NEXT: v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x47 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x47 ; GCN-NEXT: v_or_b32_e32 v18, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 7, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x46 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x46 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1443,11 +1445,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x45 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x45 ; GCN-NEXT: v_or_b32_e32 v3, v3, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x44 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x44 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1456,11 +1458,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x43 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x43 ; GCN-NEXT: v_or_b32_e32 v19, v19, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 3, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x42 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x42 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1468,11 +1470,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x41 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x41 ; GCN-NEXT: v_or_b32_e32 v3, v3, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 64 +; GCN-NEXT: s_cmp_lg_u32 s2, 64 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1485,11 +1487,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v19 ; GCN-NEXT: v_and_b32_e32 v2, 15, v2 -; GCN-NEXT: s_cmp_lg_u32 s0, 55 +; GCN-NEXT: s_cmp_lg_u32 s2, 55 ; GCN-NEXT: v_or_b32_e32 v2, v2, v15 ; GCN-NEXT: v_mov_b32_e32 v15, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 54 +; GCN-NEXT: s_cmp_lg_u32 s2, 54 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s23 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1497,12 +1499,12 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 53 +; GCN-NEXT: s_cmp_lg_u32 s2, 53 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 52 +; GCN-NEXT: s_cmp_lg_u32 s2, 52 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s21 @@ -1512,11 +1514,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 51 +; GCN-NEXT: s_cmp_lg_u32 s2, 51 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_mov_b32_e32 v16, s20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 50 +; GCN-NEXT: s_cmp_lg_u32 s2, 50 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1524,11 +1526,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 49 +; GCN-NEXT: s_cmp_lg_u32 s2, 49 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 48 +; GCN-NEXT: s_cmp_lg_u32 s2, 48 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1540,11 +1542,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 63 +; GCN-NEXT: s_cmp_lg_u32 s2, 63 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 62 +; GCN-NEXT: s_cmp_lg_u32 s2, 62 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s16 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1552,11 +1554,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 61 +; GCN-NEXT: s_cmp_lg_u32 s2, 61 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 60 +; GCN-NEXT: s_cmp_lg_u32 s2, 60 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1565,22 +1567,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 59 +; GCN-NEXT: s_cmp_lg_u32 s2, 59 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 58 +; GCN-NEXT: s_cmp_lg_u32 s2, 58 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: s_cmp_lg_u32 s0, 56 +; GCN-NEXT: s_cmp_lg_u32 s2, 56 ; GCN-NEXT: v_mov_b32_e32 v14, s16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 57 +; GCN-NEXT: s_cmp_lg_u32 s2, 57 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s16 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc @@ -1594,11 +1596,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 -; GCN-NEXT: s_cmp_lg_u32 s0, 47 +; GCN-NEXT: s_cmp_lg_u32 s2, 47 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v14, 15, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 46 +; GCN-NEXT: s_cmp_lg_u32 s2, 46 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1606,11 +1608,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 45 +; GCN-NEXT: s_cmp_lg_u32 s2, 45 ; GCN-NEXT: v_or_b32_e32 v14, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 44 +; GCN-NEXT: s_cmp_lg_u32 s2, 44 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1619,11 +1621,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 43 +; GCN-NEXT: s_cmp_lg_u32 s2, 43 ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 11, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 42 +; GCN-NEXT: s_cmp_lg_u32 s2, 42 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 10, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1631,11 +1633,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 41 +; GCN-NEXT: s_cmp_lg_u32 s2, 41 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 9, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 40 +; GCN-NEXT: s_cmp_lg_u32 s2, 40 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 8, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1647,11 +1649,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GCN-NEXT: v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s0, 39 +; GCN-NEXT: s_cmp_lg_u32 s2, 39 ; GCN-NEXT: v_or_b32_e32 v16, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 7, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 38 +; GCN-NEXT: s_cmp_lg_u32 s2, 38 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1659,11 +1661,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 37 +; GCN-NEXT: s_cmp_lg_u32 s2, 37 ; GCN-NEXT: v_or_b32_e32 v14, v14, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 36 +; GCN-NEXT: s_cmp_lg_u32 s2, 36 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1672,11 +1674,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 35 +; GCN-NEXT: s_cmp_lg_u32 s2, 35 ; GCN-NEXT: v_or_b32_e32 v17, v17, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 3, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 34 +; GCN-NEXT: s_cmp_lg_u32 s2, 34 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1684,11 +1686,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmp_lg_u32 s0, 33 +; GCN-NEXT: s_cmp_lg_u32 s2, 33 ; GCN-NEXT: v_or_b32_e32 v18, v14, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 1, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 32 +; GCN-NEXT: s_cmp_lg_u32 s2, 32 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1702,11 +1704,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v1, 15, v1 ; GCN-NEXT: v_or_b32_e32 v1, v1, v17 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s0, 23 +; GCN-NEXT: s_cmp_lg_u32 s2, 23 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v15, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 22 +; GCN-NEXT: s_cmp_lg_u32 s2, 22 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s14 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1714,11 +1716,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 21 +; GCN-NEXT: s_cmp_lg_u32 s2, 21 ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s13 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 20 +; GCN-NEXT: s_cmp_lg_u32 s2, 20 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s12 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1727,11 +1729,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 19 +; GCN-NEXT: s_cmp_lg_u32 s2, 19 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_mov_b32_e32 v16, s11 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 18 +; GCN-NEXT: s_cmp_lg_u32 s2, 18 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s10 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1739,11 +1741,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 17 +; GCN-NEXT: s_cmp_lg_u32 s2, 17 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s9 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s8 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1755,24 +1757,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 31 +; GCN-NEXT: s_cmp_lg_u32 s2, 31 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 30 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, 30 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s3 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 29 +; GCN-NEXT: s_cmp_lg_u32 s2, 29 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 28 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, 28 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s3 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1780,24 +1782,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v17, v19, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 27 +; GCN-NEXT: s_cmp_lg_u32 s2, 27 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 26 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, 26 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s3 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: s_cmp_lg_u32 s0, 24 -; GCN-NEXT: v_mov_b32_e32 v18, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, 24 +; GCN-NEXT: v_mov_b32_e32 v18, s3 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 25 +; GCN-NEXT: s_cmp_lg_u32 s2, 25 ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s3 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1809,11 +1811,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 15 +; GCN-NEXT: s_cmp_lg_u32 s2, 15 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v16, 15, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 14 +; GCN-NEXT: s_cmp_lg_u32 s2, 14 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1821,11 +1823,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 13 +; GCN-NEXT: s_cmp_lg_u32 s2, 13 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 12 +; GCN-NEXT: s_cmp_lg_u32 s2, 12 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s4 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1833,52 +1835,52 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 11 +; GCN-NEXT: s_cmp_lg_u32 s2, 11 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s4 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 10 +; GCN-NEXT: s_cmp_lg_u32 s2, 10 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 10, s4 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 9 +; GCN-NEXT: s_cmp_lg_u32 s2, 9 ; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 8 +; GCN-NEXT: s_cmp_lg_u32 s2, 8 ; GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 ; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 7 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 ; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 ; GCN-NEXT: v_cndmask_b32_e32 v11, 1, v11, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 6 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_lshrrev_b16_e64 v9, 6, s4 ; GCN-NEXT: v_cndmask_b32_e32 v10, 1, v10, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_lshrrev_b16_e64 v8, 5, s4 ; GCN-NEXT: v_cndmask_b32_e32 v9, 1, v9, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: v_lshrrev_b16_e64 v7, 4, s4 ; GCN-NEXT: v_cndmask_b32_e32 v8, 1, v8, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_lshrrev_b16_e64 v6, 3, s4 ; GCN-NEXT: v_cndmask_b32_e32 v7, 1, v7, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_lshrrev_b16_e64 v5, 2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v6, 1, v6, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_lshrrev_b16_e64 v4, 1, s4 ; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1917,9 +1919,9 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v11, v16, v11 ; GCN-NEXT: v_or_b32_e32 v0, v0, v7 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index d5265e364a17e..17fe1721f73d7 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -130,8 +130,9 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -290,8 +291,9 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v2, v3 ; GFX12-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -436,7 +438,7 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 @@ -466,7 +468,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 @@ -496,7 +498,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX10-NEXT: s_sub_i32 s1, 0, s7 @@ -526,7 +528,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s7 @@ -556,7 +558,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX11-LABEL: udiv_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX11-NEXT: s_sub_i32 s5, 0, s3 @@ -593,7 +595,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX12-LABEL: udiv_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 @@ -692,19 +694,19 @@ main_body: define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-LABEL: atomic_add_local: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX9-NEXT: s_mul_i32 s1, s1, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_mul_i32 s0, s0, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: @@ -712,19 +714,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX90A-LABEL: atomic_add_local: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB5_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX90A-NEXT: s_mul_i32 s1, s1, 5 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_mul_i32 s0, s0, 5 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 ; GFX90A-NEXT: ds_add_u32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB5_2: @@ -732,18 +734,18 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX10-LABEL: atomic_add_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, exec_lo -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s1, s2 -; GFX10-NEXT: s_mul_i32 s1, s1, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX10-NEXT: s_mul_i32 s0, s0, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -752,19 +754,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX9-FLATSCR-LABEL: atomic_add_local: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec -; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s1, 5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB5_2: @@ -772,19 +774,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX11-LABEL: atomic_add_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_mov_b32 s3, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB5_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_mul_i32 s0, s0, 5 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 ; GFX11-NEXT: ds_add_u32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -793,19 +795,20 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX12-LABEL: atomic_add_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB5_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s1, s1, 5 -; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: s_mul_i32 s0, s0, 5 +; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -874,8 +877,9 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -894,10 +898,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -906,8 +910,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -923,10 +927,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB7_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -935,8 +939,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB7_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -947,26 +951,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: atomic_add_ret_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB7_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX10-NEXT: s_mul_i32 s3, s3, 5 -; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10-NEXT: s_mul_i32 s1, s1, 5 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -982,10 +986,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-FLATSCR-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -994,8 +998,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB7_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1006,26 +1010,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: atomic_add_ret_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s3, exec_lo -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s3, s3, 5 -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mul_i32 s1, s1, 5 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB7_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1037,26 +1041,27 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: atomic_add_ret_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB7_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s3, s3, 5 -; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -1083,10 +1088,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1094,8 +1099,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1111,10 +1116,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB8_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -1122,8 +1127,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB8_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1134,24 +1139,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: add_i32_constant: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB8_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX10-NEXT: s_mul_i32 s3, s3, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10-NEXT: s_mul_i32 s1, s1, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1167,10 +1172,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -1178,8 +1183,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB8_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1190,25 +1195,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: add_i32_constant: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s3, exec_lo -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s3, s3, 5 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mul_i32 s1, s1, 5 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1220,25 +1225,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: add_i32_constant: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB8_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s3, s3, 5 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 69f181fcede30..f9073be7e260b 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; SI-LABEL: i8_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -20,10 +20,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; VI-LABEL: i8_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_and_b32 s2, s4, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,8 +32,8 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; GFX9-LABEL: i8_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -80,11 +80,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; SI-LABEL: i8_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -92,10 +92,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; VI-LABEL: i8_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_and_b32 s2, s4, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -104,8 +104,8 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; GFX9-LABEL: i8_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -155,11 +155,11 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; SI-LABEL: i8_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i8 s4, s2 +; SI-NEXT: s_sext_i32_i8 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -167,10 +167,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; VI-LABEL: i8_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i8 s2, s2 +; VI-NEXT: s_sext_i32_i8 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -179,8 +179,8 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; GFX9-LABEL: i8_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -230,11 +230,11 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; SI-LABEL: i16_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -242,10 +242,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; VI-LABEL: i16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_and_b32 s2, s4, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -254,8 +254,8 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; GFX9-LABEL: i16_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -302,11 +302,11 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; SI-LABEL: i16_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -314,10 +314,10 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; VI-LABEL: i16_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_and_b32 s2, s4, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -326,8 +326,8 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; GFX9-LABEL: i16_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -377,11 +377,11 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind { ; SI-LABEL: i16_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s2 +; SI-NEXT: s_sext_i32_i16 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -389,10 +389,10 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; VI-LABEL: i16_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_sext_i32_i16 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -401,8 +401,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; GFX9-LABEL: i16_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -452,8 +452,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; SI-LABEL: i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,19 +463,19 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; VI-LABEL: i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -511,8 +511,8 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; SI-LABEL: f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -522,19 +522,19 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; VI-LABEL: f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -570,8 +570,8 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; SI-LABEL: v2i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -581,19 +581,19 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; VI-LABEL: v2i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -659,8 +659,8 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; SI-LABEL: v2i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -670,19 +670,19 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; VI-LABEL: v2i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -718,7 +718,7 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; SI-LABEL: v2i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,7 +731,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; VI-LABEL: v2i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; GFX9-LABEL: v2i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -781,7 +781,7 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; SI-LABEL: v2f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; VI-LABEL: v2f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -805,7 +805,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; GFX9-LABEL: v2f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -844,8 +844,8 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; SI-LABEL: v3i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 @@ -858,26 +858,26 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; VI-LABEL: v3i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_lshr_b32 s2, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: flat_store_byte v[2:3], v5 ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -983,7 +983,7 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; SI-LABEL: v3i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -998,7 +998,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; VI-LABEL: v3i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1014,7 +1014,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; GFX9-LABEL: v3i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1102,8 +1102,8 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; SI-LABEL: v3i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,8 +1117,8 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; VI-LABEL: v3i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1130,14 +1130,14 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; GFX9-LABEL: v3i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3i32_arg: @@ -1181,8 +1181,8 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; SI-LABEL: v3f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,8 +1196,8 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; VI-LABEL: v3f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1209,14 +1209,14 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; GFX9-LABEL: v3f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3f32_arg: @@ -1260,8 +1260,8 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; SI-LABEL: v4i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,19 +1271,19 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; VI-LABEL: v4i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1319,7 +1319,7 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; SI-LABEL: v4i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; VI-LABEL: v4i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -1343,7 +1343,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; GFX9-LABEL: v4i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1382,8 +1382,8 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; SI-LABEL: v4i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; VI-LABEL: v4i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1410,15 +1410,15 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; GFX9-LABEL: v4i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4i32_arg: @@ -1456,8 +1456,8 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; SI-LABEL: v4f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,8 +1470,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; VI-LABEL: v4f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1484,15 +1484,15 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; GFX9-LABEL: v4f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4f32_arg: @@ -1530,7 +1530,7 @@ entry: define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { ; SI-LABEL: v5i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; GFX9-LABEL: v5i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1671,50 +1671,50 @@ entry: define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind { ; SI-LABEL: v5i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s5, s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s5, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 8 +; VI-NEXT: s_add_u32 s4, s0, 8 ; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_short v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_short v2, v3, s[4:5] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i16_arg: @@ -1902,27 +1902,27 @@ entry: define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind { ; SI-LABEL: v5i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[0:1], 0x15 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s8, s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s7, s[0:1], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s7, s[2:3], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; ; GFX9-LABEL: v5i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, s8 @@ -1951,8 +1951,8 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dword v4, v5, s[4:5] offset:16 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i32_arg: @@ -2000,27 +2000,27 @@ entry: define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind { ; SI-LABEL: v5f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[0:1], 0x15 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s8, s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s7, s[0:1], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s7, s[2:3], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -2039,19 +2039,19 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float ; ; GFX9-LABEL: v5f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: global_store_dword v4, v0, s[4:5] offset:16 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5f32_arg: @@ -2099,34 +2099,34 @@ entry: define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind { ; SI-LABEL: v5i64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2155,9 +2155,9 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> ; ; GFX9-LABEL: v5i64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2241,34 +2241,34 @@ entry: define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind { ; SI-LABEL: v5f64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2297,9 +2297,9 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; ; GFX9-LABEL: v5f64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2384,7 +2384,7 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; SI-LABEL: v8i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,7 +2397,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; VI-LABEL: v8i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; GFX9-LABEL: v8i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2635,8 +2635,8 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; SI-LABEL: v8i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,8 +2649,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; VI-LABEL: v8i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2663,15 +2663,15 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; GFX9-LABEL: v8i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8i16_arg: @@ -2883,8 +2883,8 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; SI-LABEL: v8i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2903,8 +2903,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; VI-LABEL: v8i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -2926,8 +2926,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; GFX9-LABEL: v8i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -2994,8 +2994,8 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; SI-LABEL: v8f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3014,8 +3014,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; VI-LABEL: v8f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3037,8 +3037,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; GFX9-LABEL: v8f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -3106,8 +3106,8 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; SI-LABEL: v16i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3120,8 +3120,8 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; VI-LABEL: v16i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3134,15 +3134,15 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; GFX9-LABEL: v16i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v16i8_arg: @@ -3556,8 +3556,8 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; SI-LABEL: v16i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3576,8 +3576,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; VI-LABEL: v16i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3599,8 +3599,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; GFX9-LABEL: v16i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -4012,8 +4012,8 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; SI-LABEL: v16i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4044,8 +4044,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; VI-LABEL: v16i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4085,8 +4085,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; GFX9-LABEL: v16i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4200,8 +4200,8 @@ entry: define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; SI-LABEL: v16f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4232,8 +4232,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; VI-LABEL: v16f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4273,8 +4273,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; GFX9-LABEL: v16f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4388,7 +4388,7 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: kernel_arg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4401,7 +4401,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; VI-LABEL: kernel_arg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4412,7 +4412,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; GFX9-LABEL: kernel_arg_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4450,7 +4450,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; SI-LABEL: f64_kernel_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4463,7 +4463,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: f64_kernel_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4474,7 +4474,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; GFX9-LABEL: f64_kernel_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4522,8 +4522,8 @@ entry: define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { ; SI-LABEL: i65_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s8, s4, 1 @@ -4539,8 +4539,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -4558,11 +4558,11 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; GFX9-LABEL: i65_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 1 +; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -4640,11 +4640,11 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 1 +; SI-NEXT: s_and_b32 s4, s4, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -4652,10 +4652,10 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; VI-LABEL: i1_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_and_b32 s2, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4664,8 +4664,8 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; GFX9-LABEL: i1_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4731,11 +4731,11 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 1 +; SI-NEXT: s_and_b32 s4, s4, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4743,10 +4743,10 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_and_b32 s2, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4755,8 +4755,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4803,8 +4803,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4816,11 +4816,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_and_b32 s2, s4, 1 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -4829,8 +4829,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4879,11 +4879,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 +; SI-NEXT: s_bfe_i32 s4, s4, 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4891,10 +4891,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 +; VI-NEXT: s_bfe_i32 s2, s4, 0x10000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4903,8 +4903,8 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_sext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 @@ -4953,11 +4953,11 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -4966,21 +4966,21 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_sext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 @@ -5062,10 +5062,10 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { ; SI-LABEL: struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s9, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x11 +; SI-NEXT: s_load_dword s8, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dword s9, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x11 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -5089,46 +5089,46 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; ; VI-LABEL: struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s5, s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; VI-NEXT: s_load_dword s4, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s5, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x44 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: struct_argument_alignment: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s5, s[6:7], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -5196,6 +5196,7 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; SI-LABEL: packed_struct_argument_alignment: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dword s6, s[0:1], 0x9 @@ -5229,37 +5230,37 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; VI-LABEL: packed_struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s2, s0, 49 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: s_add_u32 s4, s0, 50 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_add_u32 s2, s2, 3 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 51 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 49 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s4, s2, 50 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s0, 3 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 51 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: s_add_u32 s2, s0, 53 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s0, s2, 53 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28 +; VI-NEXT: s_load_dword s4, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v7, s4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v7 @@ -5280,10 +5281,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; GFX9-LABEL: packed_struct_argument_alignment: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 +; GFX9-NEXT: global_load_dword v6, v2, s[6:7] offset:13 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] offset:17 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5379,11 +5380,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { ; SI-LABEL: struct_argument_alignment_after: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_load_dword s13, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 +; SI-NEXT: s_load_dword s12, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb +; SI-NEXT: s_load_dword s13, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x15 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -5413,11 +5414,11 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; VI-LABEL: struct_argument_alignment_after: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dword s9, s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; VI-NEXT: s_load_dword s8, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dword s9, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x54 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5445,19 +5446,19 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; GFX9-LABEL: struct_argument_alignment_after: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s10, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 @@ -5545,7 +5546,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; SI-LABEL: array_3xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5565,7 +5566,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5583,7 +5584,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; GFX9-LABEL: array_3xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -5659,7 +5660,8 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; SI-LABEL: array_3xi16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_mov_b64 s[0:1], s[2:3] +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 @@ -5679,22 +5681,22 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; ; VI-LABEL: array_3xi16: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s2, s0, 38 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s0, 42 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_add_u32 s0, s2, 38 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s4, s0, 2 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s0, s2, 42 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_ushort v4, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5711,10 +5713,10 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GFX9-LABEL: array_3xi16: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 -; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] offset:2 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -5829,6 +5831,7 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; SI-LABEL: small_array_round_down_offset: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 @@ -5839,8 +5842,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; ; VI-LABEL: small_array_round_down_offset: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, 37 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 37 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ubyte v0, v[0:1] @@ -5852,7 +5855,7 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; GFX9-LABEL: small_array_round_down_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5886,8 +5889,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_align_constant_i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x49 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5901,13 +5904,13 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; VI-LABEL: byref_align_constant_i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x124 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -5916,8 +5919,8 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; GFX9-LABEL: byref_align_constant_i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x100 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -5970,83 +5973,83 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_natural_align_constant_v16i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0x29 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s20, s[2:3], 0x29 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: byref_natural_align_constant_v16i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s20, s[0:1], 0xa4 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s20, s[2:3], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_add_u32 s0, s2, 48 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 32 +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s20 @@ -6056,9 +6059,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 45a1afbf11992..77722d96d5a4c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s6, s7 @@ -51,7 +51,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 @@ -67,8 +67,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -78,10 +78,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -89,33 +89,33 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4 -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2 +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -145,8 +145,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -188,13 +188,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -203,13 +203,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -218,8 +218,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -246,7 +248,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -262,7 +264,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -279,7 +281,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -290,7 +292,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -301,7 +303,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -324,7 +328,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -340,7 +344,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -357,7 +361,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -368,7 +372,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -379,7 +383,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -402,8 +408,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -422,8 +428,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -445,13 +451,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -460,13 +466,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -475,8 +481,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -504,8 +512,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -524,8 +532,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -547,13 +555,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -562,13 +570,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -577,8 +585,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -606,8 +616,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -626,8 +636,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -649,13 +659,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -664,13 +674,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -679,8 +689,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -709,8 +721,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -729,8 +741,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -752,13 +764,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -767,13 +779,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -782,8 +794,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index 5d20a848bd6a6..d70df38fe6037 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -15,7 +15,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -27,7 +27,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| @@ -37,7 +37,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -50,7 +50,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| @@ -126,7 +126,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -164,23 +164,23 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -191,13 +191,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -208,10 +208,10 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -222,23 +222,23 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -249,13 +249,13 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -266,10 +266,10 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -280,23 +280,23 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -307,13 +307,13 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -324,10 +324,10 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -338,23 +338,23 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -365,13 +365,13 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -382,10 +382,10 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -396,23 +396,23 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -423,13 +423,13 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -440,10 +440,10 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -454,23 +454,23 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -481,13 +481,13 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -498,10 +498,10 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -512,23 +512,23 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -539,13 +539,13 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -556,10 +556,10 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -570,23 +570,23 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -597,13 +597,13 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -614,10 +614,10 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -628,23 +628,23 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -655,13 +655,13 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -672,10 +672,10 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -686,23 +686,23 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -713,13 +713,13 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -730,10 +730,10 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -744,23 +744,23 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -771,13 +771,13 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -788,10 +788,10 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -802,23 +802,23 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -829,13 +829,13 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -846,10 +846,10 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -860,23 +860,23 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -887,13 +887,13 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -904,10 +904,10 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -918,23 +918,23 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -945,13 +945,13 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13) store i32 %result, ptr addrspace(1) %out @@ -961,7 +961,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -973,7 +973,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -983,7 +983,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -996,7 +996,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_one: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_one: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_one: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_one: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1083,7 +1083,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1111,7 +1111,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_olt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_olt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_olt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_olt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1211,7 +1211,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ole: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ole: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ole: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1246,7 +1246,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ole: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1273,7 +1273,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1311,7 +1311,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_o: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_o: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_o: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_o: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uo: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1373,7 +1373,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uo: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uo: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1396,7 +1396,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uo: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_une: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1423,7 +1423,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_une: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1433,7 +1433,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_une: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_une: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1473,7 +1473,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1523,7 +1523,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1546,7 +1546,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1623,7 +1623,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1646,7 +1646,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1663,12 +1663,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1678,26 +1678,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1707,14 +1707,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half %temp, i32 1) @@ -1727,12 +1727,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1742,26 +1742,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1771,14 +1771,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %src_input = call half @llvm.fabs.f16(half %src) @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1808,7 +1808,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1823,10 +1823,10 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1837,23 +1837,23 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1864,13 +1864,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -1882,10 +1882,10 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1896,23 +1896,23 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1923,13 +1923,13 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -1941,10 +1941,10 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1955,23 +1955,23 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1982,13 +1982,13 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -2000,10 +2000,10 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2014,23 +2014,23 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2041,13 +2041,13 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -2059,10 +2059,10 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2073,23 +2073,23 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2100,13 +2100,13 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -2118,10 +2118,10 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2132,23 +2132,23 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2159,13 +2159,13 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -2177,10 +2177,10 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2191,23 +2191,23 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2218,13 +2218,13 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -2236,10 +2236,10 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2250,23 +2250,23 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2277,13 +2277,13 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -2295,10 +2295,10 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2309,23 +2309,23 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2336,13 +2336,13 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -2354,10 +2354,10 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2368,23 +2368,23 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2395,13 +2395,13 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -2413,10 +2413,10 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2427,23 +2427,23 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2454,13 +2454,13 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -2471,10 +2471,10 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2485,23 +2485,23 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2512,13 +2512,13 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -2529,10 +2529,10 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2543,23 +2543,23 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2570,13 +2570,13 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -2587,10 +2587,10 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2601,23 +2601,23 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2628,13 +2628,13 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13) store i32 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 674fec1b865a6..734d1472d054f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -30,7 +30,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 @@ -42,7 +42,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -74,7 +74,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |s3| @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -137,7 +137,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -163,7 +163,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -178,11 +178,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -193,24 +193,24 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -220,11 +220,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -240,11 +240,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -255,24 +255,24 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -282,11 +282,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -302,11 +302,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -317,24 +317,24 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -344,11 +344,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -364,11 +364,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -379,24 +379,24 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -406,11 +406,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -426,11 +426,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -441,24 +441,24 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -468,11 +468,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -488,11 +488,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -503,24 +503,24 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -530,11 +530,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -550,11 +550,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -565,24 +565,24 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -592,11 +592,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -612,11 +612,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -627,24 +627,24 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -674,11 +674,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -689,24 +689,24 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -716,11 +716,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -736,11 +736,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -751,24 +751,24 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -778,11 +778,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -798,11 +798,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -813,24 +813,24 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -840,11 +840,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -860,11 +860,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -875,24 +875,24 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -902,11 +902,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -922,11 +922,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -937,24 +937,24 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -964,11 +964,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -984,11 +984,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -999,24 +999,24 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1026,11 +1026,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oeq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1059,7 +1059,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1086,7 +1086,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1105,7 +1105,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1119,7 +1119,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1165,7 +1165,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ogt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1206,7 +1206,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1225,7 +1225,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1252,7 +1252,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_olt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1299,7 +1299,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1345,7 +1345,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ole: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ueq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1419,7 +1419,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,7 +1465,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_o: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1506,7 +1506,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,7 +1525,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1585,7 +1585,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_une: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1599,7 +1599,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1626,7 +1626,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,7 +1705,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1719,7 +1719,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1765,7 +1765,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1792,7 +1792,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +1806,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1825,7 +1825,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1839,7 +1839,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,13 +1887,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |s2| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1904,26 +1904,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; GFX9-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1933,12 +1933,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1956,13 +1956,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |s2| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1973,26 +1973,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2002,12 +2002,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2028,7 +2028,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2042,7 +2042,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -2070,11 +2070,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2085,24 +2085,24 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2112,11 +2112,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2133,11 +2133,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2148,24 +2148,24 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2175,11 +2175,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2196,11 +2196,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2211,24 +2211,24 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2238,11 +2238,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2259,11 +2259,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2274,24 +2274,24 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2301,11 +2301,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2322,11 +2322,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2337,24 +2337,24 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2364,11 +2364,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2385,11 +2385,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2400,24 +2400,24 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,11 +2427,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2448,11 +2448,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,24 +2463,24 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2490,11 +2490,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2511,11 +2511,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2526,24 +2526,24 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2553,11 +2553,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2574,11 +2574,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2589,24 +2589,24 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2616,11 +2616,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2637,11 +2637,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2652,24 +2652,24 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2679,11 +2679,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2700,11 +2700,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2715,24 +2715,24 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2742,11 +2742,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2762,11 +2762,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2777,24 +2777,24 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2804,11 +2804,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2824,11 +2824,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2839,24 +2839,24 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2866,11 +2866,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2886,11 +2886,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2901,24 +2901,24 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2928,11 +2928,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 8fe85e49a4207..746b00ba7b231 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -22,10 +22,10 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -36,23 +36,23 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -63,13 +63,13 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -97,7 +97,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -111,10 +111,10 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -125,23 +125,23 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -152,13 +152,13 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -169,10 +169,10 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -183,23 +183,23 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -210,13 +210,13 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -227,10 +227,10 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,23 +241,23 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -268,13 +268,13 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -285,10 +285,10 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -299,23 +299,23 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -326,13 +326,13 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -343,10 +343,10 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -357,23 +357,23 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -384,13 +384,13 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -401,10 +401,10 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i32_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -415,23 +415,23 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i32_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -442,13 +442,13 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i32_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -459,10 +459,10 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -473,23 +473,23 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -500,13 +500,13 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -517,10 +517,10 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -531,23 +531,23 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -558,13 +558,13 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -575,10 +575,10 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -589,23 +589,23 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -616,13 +616,13 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -632,7 +632,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_eq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_eq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -654,7 +654,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_eq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -667,7 +667,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_eq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_ne: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -694,7 +694,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_ne: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -704,7 +704,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_ne: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -717,7 +717,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_ne: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -732,7 +732,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -744,7 +744,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -754,7 +754,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -767,7 +767,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -782,7 +782,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -804,7 +804,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -817,7 +817,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -832,7 +832,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -844,7 +844,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -854,7 +854,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -867,7 +867,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -894,7 +894,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -904,7 +904,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -932,7 +932,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sgt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -944,7 +944,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sgt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -954,7 +954,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sgt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -967,7 +967,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sgt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -982,7 +982,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -994,7 +994,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_slt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_slt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_slt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1067,7 +1067,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_slt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sle: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sle: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sle: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1117,7 +1117,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sle: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1133,10 +1133,10 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1147,23 +1147,23 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1174,13 +1174,13 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1222,10 +1222,10 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1236,23 +1236,23 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1263,13 +1263,13 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -1280,10 +1280,10 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1294,23 +1294,23 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1321,13 +1321,13 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -1338,10 +1338,10 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1352,23 +1352,23 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1379,13 +1379,13 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -1396,10 +1396,10 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1410,23 +1410,23 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1437,13 +1437,13 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -1454,10 +1454,10 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1468,23 +1468,23 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1495,13 +1495,13 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -1512,10 +1512,10 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i16_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1526,23 +1526,23 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i16_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1553,13 +1553,13 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i16_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -1570,10 +1570,10 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1584,23 +1584,23 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1611,13 +1611,13 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -1628,10 +1628,10 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1642,23 +1642,23 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1669,13 +1669,13 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -1686,10 +1686,10 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1700,23 +1700,23 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1727,13 +1727,13 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 @@ -1759,7 +1759,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_gt_u32 s6, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index a650f999835c6..2625c1f152219 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -25,11 +25,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -40,11 +40,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -54,24 +54,24 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i32: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -131,11 +131,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -146,11 +146,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -160,24 +160,24 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -193,11 +193,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -208,11 +208,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -222,24 +222,24 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -255,11 +255,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -270,11 +270,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -284,24 +284,24 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,11 +317,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -332,11 +332,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -346,24 +346,24 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -379,11 +379,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -394,11 +394,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -408,24 +408,24 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -441,11 +441,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GFX11-LABEL: v_icmp_i32_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -456,11 +456,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i32_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -470,24 +470,24 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; GFX9-LABEL: v_icmp_i32_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -503,11 +503,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -518,11 +518,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -532,24 +532,24 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -565,11 +565,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -580,11 +580,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -594,24 +594,24 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -627,11 +627,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -642,11 +642,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -656,24 +656,24 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -688,7 +688,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_eq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3] @@ -702,7 +702,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,7 +716,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -729,7 +729,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -748,7 +748,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_ne: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3] @@ -762,7 +762,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -776,7 +776,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -808,7 +808,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3] @@ -822,7 +822,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -909,7 +909,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -928,7 +928,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3] @@ -942,7 +942,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -969,7 +969,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -988,7 +988,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3] @@ -1002,7 +1002,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sgt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1108,7 +1108,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3] @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_slt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1209,7 +1209,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,7 +1228,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3] @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1269,7 +1269,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1289,11 +1289,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1304,11 +1304,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1318,24 +1318,24 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1362,7 +1362,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i16: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -1395,11 +1395,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1410,11 +1410,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1424,24 +1424,24 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1457,11 +1457,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,11 +1472,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1486,24 +1486,24 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1519,11 +1519,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,11 +1534,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1548,24 +1548,24 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1581,11 +1581,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1596,11 +1596,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1610,24 +1610,24 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1643,11 +1643,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1658,11 +1658,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1672,24 +1672,24 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1705,11 +1705,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GFX11-LABEL: v_icmp_i16_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1720,11 +1720,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i16_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1734,24 +1734,24 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; GFX9-LABEL: v_icmp_i16_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,11 +1767,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1796,24 +1796,24 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1829,11 +1829,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1844,11 +1844,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1858,24 +1858,24 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1891,11 +1891,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1906,11 +1906,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1920,24 +1920,24 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1952,7 +1952,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 @@ -1970,7 +1970,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: v_icmp_i1_ne0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_u32 s2, 1 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -1986,7 +1986,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_gt_u32 s6, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index b0706025f0b68..3168e05b816be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,12 +14,13 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s2, v0 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 ; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 @@ -44,7 +45,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 ; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v0, s3, v0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(4) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] ; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 @@ -80,7 +81,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[128:131] -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 @@ -151,13 +152,13 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 -; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s2, v0 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -176,7 +177,8 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 ; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v0, s3, v0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 @@ -218,7 +220,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 74d936f8093dc..4930317143a76 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -163,7 +163,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 @@ -189,7 +189,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 @@ -215,11 +215,13 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -228,8 +230,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -260,7 +262,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 @@ -283,7 +285,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 @@ -306,21 +308,22 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 -; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -349,9 +352,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -366,7 +369,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -375,9 +378,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 @@ -391,7 +394,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] @@ -401,14 +404,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 -; GFX11-NEXT: v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0 +; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -442,9 +447,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -456,7 +461,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -465,9 +470,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -478,7 +483,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 @@ -488,19 +493,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102 -; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 +; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index cd92529b77165..430fc46592553 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -15,20 +15,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX10-LABEL: v_permlane16_b32_vss: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,8 +41,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX12-LABEL: v_permlane16_b32_vss: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -60,22 +60,22 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; GFX10-LABEL: v_permlane16_b32_vii: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vii: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-LABEL: v_permlane16_b32_vii: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -104,25 +104,25 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0 ; GFX10-LABEL: v_permlane16_b32_vll: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vll: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-LABEL: v_permlane16_b32_vll: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -150,33 +150,33 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; GFX10-LABEL: v_permlane16_b32_vvv: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -185,17 +185,17 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -203,7 +203,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -221,7 +221,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -245,8 +245,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -257,8 +256,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -269,12 +267,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -283,11 +281,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -296,12 +295,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -310,11 +309,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -329,8 +329,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX10-LABEL: v_permlane16_b32_vsv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -341,7 +340,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0 ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -356,7 +355,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0 ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -371,7 +370,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -386,7 +385,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -408,20 +407,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s ; GFX10-LABEL: v_permlane16_b32_vss_fi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -434,8 +433,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s ; GFX12-LABEL: v_permlane16_b32_vss_fi: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -453,20 +452,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s ; GFX10-LABEL: v_permlane16_b32_vss_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -479,8 +478,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s ; GFX12-LABEL: v_permlane16_b32_vss_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -498,20 +497,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -524,8 +523,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -543,20 +542,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src ; GFX10-LABEL: v_permlanex16_b32_vss: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -569,8 +568,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src ; GFX12-LABEL: v_permlanex16_b32_vss: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -588,22 +587,22 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src ; GFX10-LABEL: v_permlanex16_b32_vii: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vii: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -613,7 +612,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src ; ; GFX12-LABEL: v_permlanex16_b32_vii: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -632,25 +631,25 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src ; GFX10-LABEL: v_permlanex16_b32_vll: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vll: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -659,7 +658,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src ; ; GFX12-LABEL: v_permlanex16_b32_vll: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -678,33 +677,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX10-LABEL: v_permlanex16_b32_vvv: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -713,17 +712,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -731,7 +730,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -749,7 +748,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -773,8 +772,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -785,8 +783,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -797,12 +794,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -811,11 +808,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -824,12 +822,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -838,11 +836,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -857,8 +856,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX10-LABEL: v_permlanex16_b32_vsv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -869,7 +867,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -884,7 +882,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -899,7 +897,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -914,7 +912,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -936,20 +934,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlanex16_b32_vss_fi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -962,8 +960,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 % ; GFX12-LABEL: v_permlanex16_b32_vss_fi: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -981,20 +979,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlanex16_b32_vss_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1007,8 +1005,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 % ; GFX12-LABEL: v_permlanex16_b32_vss_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1026,20 +1024,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1052,8 +1050,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i3 ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1071,23 +1069,24 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_tid_tid: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1095,12 +1094,13 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 % ; GFX12-LABEL: v_permlane16_b32_tid_tid: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1114,23 +1114,24 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlane16_b32_undef_tid: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1138,12 +1139,13 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlane16_b32_undef_tid: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1158,23 +1160,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -1182,13 +1184,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -1196,14 +1199,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -1211,13 +1215,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -1225,14 +1230,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -1246,23 +1252,24 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlane16_b32_i_tid_fi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1270,12 +1277,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlane16_b32_i_tid_fi: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1290,23 +1298,24 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlane16_b32_i_tid_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1314,12 +1323,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlane16_b32_i_tid_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1334,23 +1344,24 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1358,12 +1369,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1378,23 +1390,24 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_tid_tid: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1402,12 +1415,13 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlanex16_b32_tid_tid: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1421,23 +1435,24 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlanex16_b32_undef_tid: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1445,12 +1460,13 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3 ; GFX12-LABEL: v_permlanex16_b32_undef_tid: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1465,23 +1481,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -1489,13 +1505,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -1503,14 +1520,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -1518,13 +1536,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -1532,14 +1551,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -1553,23 +1573,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1577,12 +1598,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1597,23 +1619,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1621,12 +1644,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1641,23 +1665,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1665,12 +1690,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index 973678291e263..a65143255bbb4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -54,7 +54,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -72,7 +72,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -103,7 +103,8 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -115,9 +116,9 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -134,7 +135,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -147,7 +148,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -165,7 +166,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -178,7 +179,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -196,7 +197,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -209,7 +210,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -227,7 +228,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -240,7 +241,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -258,7 +259,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -271,7 +272,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -289,7 +290,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -302,7 +303,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -320,7 +321,8 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -332,9 +334,9 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -351,7 +353,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -364,7 +366,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -382,7 +384,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -395,7 +397,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -413,7 +415,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -426,7 +428,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -445,10 +447,11 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -459,10 +462,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -480,10 +483,11 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -494,10 +498,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -516,11 +520,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -531,10 +535,12 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -552,10 +558,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -566,10 +573,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -588,10 +595,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -602,10 +610,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -624,10 +632,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -638,10 +647,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -660,10 +669,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -674,10 +684,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -695,10 +705,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -709,10 +720,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -731,11 +742,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -746,10 +757,12 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -767,10 +780,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -781,10 +795,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -803,10 +817,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -817,10 +832,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -839,10 +854,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -853,10 +869,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 47c021769aa56..d521a6c25e462 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -10,102 +10,104 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry -; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT0-NEXT: s_load_dword s0, s[0:1], 0xb -; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 -; VARIANT0-NEXT: s_mov_b32 s6, 0 +; VARIANT0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; VARIANT0-NEXT: s_load_dword s4, s[2:3], 0xb +; VARIANT0-NEXT: s_mov_b32 s3, 0xf000 +; VARIANT0-NEXT: s_mov_b32 s2, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 +; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 -; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; VARIANT0-NEXT: s_endpgm ; ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry -; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb -; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 -; VARIANT1-NEXT: s_mov_b32 s6, 0 +; VARIANT1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; VARIANT1-NEXT: s_load_dword s4, s[2:3], 0xb +; VARIANT1-NEXT: s_mov_b32 s3, 0xf000 +; VARIANT1-NEXT: s_mov_b32 s2, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 +; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) -; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 ; VARIANT1-NEXT: s_waitcnt vmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; VARIANT1-NEXT: s_endpgm ; ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry -; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c +; VARIANT2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VARIANT2-NEXT: s_load_dword s4, s[2:3], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] ; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 -; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; VARIANT2-NEXT: v_mov_b32_e32 v3, s1 +; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier ; VARIANT2-NEXT: global_load_dword v0, v[0:1], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry -; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c +; VARIANT3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VARIANT3-NEXT: s_load_dword s4, s[2:3], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] ; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 -; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; VARIANT3-NEXT: v_mov_b32_e32 v3, s1 +; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT3-NEXT: s_barrier ; VARIANT3-NEXT: global_load_dword v0, v[0:1], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] ; VARIANT3-NEXT: s_endpgm ; ; VARIANT4-LABEL: test_barrier: ; VARIANT4: ; %bb.0: ; %entry -; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT4-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT4-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) +; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v2 ; VARIANT4-NEXT: s_wait_kmcnt 0x0 -; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 -; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2 +; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1] ; VARIANT4-NEXT: s_wait_storecnt 0x0 ; VARIANT4-NEXT: s_barrier_signal -1 ; VARIANT4-NEXT: s_barrier_wait -1 -; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT4-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] -; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 +; VARIANT4-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; VARIANT4-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo -; VARIANT4-NEXT: global_load_b32 v0, v[1:2], off +; VARIANT4-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; VARIANT4-NEXT: global_load_b32 v0, v[0:1], off ; VARIANT4-NEXT: s_wait_loadcnt 0x0 ; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_nop 0 @@ -114,20 +116,22 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT5-LABEL: test_barrier: ; VARIANT5: ; %bb.0: ; %entry -; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT5-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT5-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) +; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v2 ; VARIANT5-NEXT: s_wait_kmcnt 0x0 -; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2 -; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT5-NEXT: v_xad_u32 v0, v2, -1, s2 +; VARIANT5-NEXT: global_store_b32 v3, v2, s[0:1] ; VARIANT5-NEXT: s_barrier_signal -1 ; VARIANT5-NEXT: s_barrier_wait -1 -; VARIANT5-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT5-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT5-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] -; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 +; VARIANT5-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; VARIANT5-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo -; VARIANT5-NEXT: global_load_b32 v0, v[1:2], off +; VARIANT5-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; VARIANT5-NEXT: global_load_b32 v0, v[0:1], off ; VARIANT5-NEXT: s_wait_loadcnt 0x0 ; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_nop 0 @@ -136,23 +140,24 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT6-LABEL: test_barrier: ; VARIANT6: ; %bb.0: ; %entry -; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0 +; VARIANT6-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 +; VARIANT6-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0 ; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 -; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0 -; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] +; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) +; VARIANT6-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4 +; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4 +; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1] +; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT6-NEXT: s_wait_storecnt 0x0 ; VARIANT6-NEXT: s_barrier_signal -1 ; VARIANT6-NEXT: s_barrier_wait -1 -; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT6-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] -; VARIANT6-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 +; VARIANT6-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; VARIANT6-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo -; VARIANT6-NEXT: global_load_b32 v0, v[1:2], off +; VARIANT6-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; VARIANT6-NEXT: global_load_b32 v0, v[0:1], off ; VARIANT6-NEXT: s_wait_loadcnt 0x0 ; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 38a34ec6daf73..8bfe996c6a90a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -5,10 +5,11 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -22,10 +23,11 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -51,10 +53,11 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -68,10 +71,11 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -97,10 +101,11 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -114,10 +119,11 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -143,12 +149,12 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_var: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_mov_b32 m0, 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] @@ -162,11 +168,13 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v2, 0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -222,8 +230,10 @@ define void @test2_s_barrier_signal_var(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -242,8 +252,10 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -278,8 +290,10 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -298,8 +312,10 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -334,8 +350,10 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -354,8 +372,10 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -390,9 +410,11 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst_var: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GCN-NEXT: s_mov_b32 m0, 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -411,9 +433,11 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -518,10 +542,11 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test1_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -535,10 +560,11 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -562,10 +588,11 @@ entry: define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test2_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -579,10 +606,11 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -606,10 +634,11 @@ entry: define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test3_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -623,10 +652,11 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -650,15 +680,17 @@ entry: define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { ; GCN-LABEL: test4_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_barrier_init m0 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -668,10 +700,11 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 @@ -732,11 +765,12 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_barrier_join -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_barrier_join -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -746,10 +780,11 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -772,11 +807,12 @@ entry: define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_barrier_join 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_barrier_join 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -786,10 +822,11 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -812,11 +849,12 @@ entry: define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_barrier_join 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_barrier_join 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -826,10 +864,11 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -852,11 +891,11 @@ entry: define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_barrier_join_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -869,10 +908,11 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -924,8 +964,10 @@ define void @test5_s_barrier_join_m0(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_leave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_leave @@ -943,14 +985,16 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_leave: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_leave ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 ; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GLOBAL-ISEL-NEXT: s_clause 0x1 @@ -978,11 +1022,12 @@ entry: define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_wakeup_barrier -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_wakeup_barrier -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -992,10 +1037,11 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1018,11 +1064,12 @@ entry: define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_wakeup_barrier 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_wakeup_barrier 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1032,10 +1079,11 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1058,11 +1106,12 @@ entry: define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_wakeup_barrier 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_wakeup_barrier 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1072,10 +1121,11 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1098,11 +1148,11 @@ entry: define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_wakeup_barrier_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -1115,10 +1165,11 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 ; ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -1170,11 +1221,12 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s2, -1 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_get_barrier_state s4, -1 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1182,13 +1234,14 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1206,11 +1259,12 @@ entry: define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s2, 1 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_get_barrier_state s4, 1 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1218,13 +1272,14 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1242,11 +1297,12 @@ entry: define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s2, 0 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_get_barrier_state s4, 0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1254,13 +1310,14 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1278,8 +1335,10 @@ entry: define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_get_barrier_state_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1294,8 +1353,10 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i ; ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1352,10 +1413,11 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test_barrier_convert: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1369,10 +1431,11 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test_barrier_convert: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index c2e74eb05d164..527627a5a2f67 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -5,10 +5,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -72,10 +73,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -175,10 +177,11 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 @@ -256,10 +259,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index fdcb1773d0a3f..a29e2298210a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -7,11 +7,12 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16( define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; GCN-NEXT: ds_load_b128 v[8:11], v0 @@ -58,11 +59,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 @@ -147,127 +149,131 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v16, 0x3ff, v0 ; GCN-NEXT: v_mov_b32_e32 v18, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0 -; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1 -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024 -; GCN-NEXT: ds_load_b128 v[1:4], v17 -; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_lshl_add_u32 v17, v16, 5, s0 +; GCN-NEXT: v_lshl_add_u32 v16, v16, 4, s1 +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:1024 +; GCN-NEXT: ds_load_b128 v[0:3], v17 +; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x2 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_wait_dscnt 0x0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560 -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_store_b128 v16, v[12:15] +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:2560 +; GCN-NEXT: v_mov_b32_e32 v16, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512 -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608 +; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:512 +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:4608 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024 -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168 +; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1024 +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:7168 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536 -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:10240 +; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1536 +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:10240 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048 +; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2048 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0 -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1 -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17 -; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v16, 5, s0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v16, v16, 4, s1 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:2560 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512 -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:4608 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:7168 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536 -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:10240 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2048 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index ae5b62ffb285b..b8a4674833cee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,7 +29,8 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -96,7 +97,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -178,34 +180,27 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 -; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 +; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 +; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 +; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -213,24 +208,33 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:80 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:48 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 +; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 +; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v19, v19, v19 ; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 +; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v23, v23, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -241,12 +245,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 ; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 ; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:80 ; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:48 ; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 ; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 ; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] @@ -258,34 +261,27 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -293,24 +289,33 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) @@ -321,12 +326,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:80 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:48 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] @@ -381,23 +385,23 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -405,6 +409,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -418,10 +423,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 +; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 +; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:80 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -430,8 +442,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -439,53 +450,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:64 +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -493,6 +498,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -506,10 +512,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -518,8 +531,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -527,31 +539,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -614,10 +620,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s2, v0 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -661,7 +668,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 2.0 -; GCN-NEXT: v_add_u32_e32 v0, s3, v0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(14) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -681,7 +688,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -720,10 +727,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -767,7 +775,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -787,7 +795,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -862,12 +870,13 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v0, s2, v1 +; GCN-NEXT: v_add_u32_e32 v0, s0, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -878,7 +887,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, s3, v1 +; GCN-NEXT: v_add_u32_e32 v1, s1, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 @@ -902,7 +911,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -995,12 +1004,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s2, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -1011,7 +1021,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s3, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 @@ -1035,7 +1045,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -1188,9 +1198,9 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, <5 x float> %in1) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1202,7 +1212,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_add_u32_e32 v1, s2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1277,7 +1288,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mul_f32_e32 v4, s7, v3 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[0:1], 0x54 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x54 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1313,7 +1324,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_add_u32_e32 v0, s3, v0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1324,8 +1335,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1372,9 +1383,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1386,7 +1397,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1461,7 +1473,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s7, v3 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[0:1], 0x54 +; EXACTCUTOFF-NEXT: s_load_dword s8, s[2:3], 0x54 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1497,7 +1509,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1508,8 +1520,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr2_sgpr3 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index fc33206845a71..114d2d099ab7b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -24,7 +24,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 1 @@ -39,7 +39,7 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -61,7 +61,7 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -77,17 +77,17 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s3, 56 +; GCN-NEXT: s_cmp_lg_u32 s4, 56 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -127,8 +127,8 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x40400000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -171,8 +171,8 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x10001 @@ -191,8 +191,8 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3c003c00 @@ -211,7 +211,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -235,7 +235,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1.0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -259,8 +259,8 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3f803f80 @@ -279,7 +279,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x10001 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -303,7 +303,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -327,7 +327,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -351,7 +351,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -373,8 +373,8 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -392,8 +392,8 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -411,8 +411,8 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,8 +430,8 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 0755dcddd8f46..279a64adfbda1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -50,7 +50,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -69,7 +69,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -83,7 +83,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s8, 0x7b @@ -117,7 +117,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s8, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -137,7 +137,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_reg_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_reg_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_imm_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_imm_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +187,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zextload_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -204,7 +204,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: bfe_u32_zextload_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -229,7 +229,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -248,7 +248,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: bfe_u32_zext_in_reg_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -275,7 +275,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -294,7 +294,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: bfe_u32_zext_in_reg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -341,7 +341,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -389,7 +389,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -465,7 +465,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -484,7 +484,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou ; ; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -511,7 +511,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -529,7 +529,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -553,7 +553,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -563,7 +563,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -580,7 +580,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -590,7 +590,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -607,7 +607,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,7 +617,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -635,7 +635,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -653,7 +653,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -679,7 +679,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -698,7 +698,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -724,7 +724,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -742,7 +742,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -767,7 +767,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -785,7 +785,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -810,7 +810,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -828,7 +828,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -852,7 +852,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -870,7 +870,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -894,7 +894,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -912,7 +912,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -936,7 +936,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -954,7 +954,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -979,7 +979,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -997,7 +997,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1197,7 +1197,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1207,7 +1207,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1232,7 +1232,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1272,7 +1272,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1357,7 +1357,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1382,7 +1382,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 40 @@ -1407,7 +1407,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 40 @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_15: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_15: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1447,7 +1447,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_17: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_17: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1497,7 +1497,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1526,47 +1526,45 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s4, s10 -; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 63, v0 ; SI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { @@ -1581,11 +1579,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1593,8 +1591,8 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1611,7 +1609,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s2, s3 @@ -1625,7 +1623,7 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 ; ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,11 +1643,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1657,8 +1655,8 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1675,11 +1673,11 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1687,8 +1685,8 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,11 +1703,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: shl_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 +; SI-NEXT: s_bfe_u32 s4, s4, 0x150002 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1717,8 +1715,8 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: shl_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 4ce0ff20e3b73..4bed23487445a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -12,33 +12,34 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -46,33 +47,34 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -80,16 +82,16 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -97,36 +99,36 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -134,10 +136,10 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -162,29 +164,29 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -336,7 +338,7 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -388,7 +390,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -440,7 +442,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 @@ -479,7 +481,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -518,7 +520,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -560,7 +562,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -851,25 +853,25 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -915,6 +917,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -923,19 +926,19 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -987,6 +990,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -995,11 +999,11 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1048,10 +1052,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_exp_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1101,10 +1105,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1156,10 +1160,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1590,26 +1594,26 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1673,6 +1677,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1681,29 +1686,28 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1719,7 +1723,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1744,6 +1748,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1764,6 +1769,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1772,11 +1778,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1787,8 +1793,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1833,17 +1839,16 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1905,11 +1910,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1921,7 +1926,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1967,17 +1972,16 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 5ab960f47f57b..ec7e52532cd32 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -14,33 +14,34 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -48,33 +49,34 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -82,16 +84,16 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -99,36 +101,36 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -136,10 +138,10 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -164,29 +166,29 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -338,7 +340,7 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -390,7 +392,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -442,7 +444,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 @@ -481,7 +483,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -520,7 +522,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -562,7 +564,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -853,25 +855,25 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -917,6 +919,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -925,19 +928,19 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -989,6 +992,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -997,11 +1001,11 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1050,10 +1054,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1103,10 +1107,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1158,10 +1162,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1592,26 +1596,26 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1675,6 +1679,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1683,29 +1688,28 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1721,7 +1725,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1746,6 +1750,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1766,6 +1771,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1774,11 +1780,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1789,8 +1795,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1835,17 +1841,16 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1907,11 +1912,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1923,7 +1928,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1969,17 +1974,16 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 6cca705f7b1db..acbb8684da924 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -12,17 +12,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_exp2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 @@ -31,35 +31,35 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -69,14 +69,14 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -101,24 +101,25 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -172,7 +173,7 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -198,7 +199,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -222,7 +223,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -246,7 +247,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -270,7 +271,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -293,7 +294,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -380,8 +381,8 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -412,8 +413,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 @@ -444,8 +445,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -475,11 +476,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -506,8 +507,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -531,16 +532,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -655,45 +656,45 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v6, s2, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v8, s1, v8 -; SI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_exp_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_exp_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -729,8 +730,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -766,8 +767,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -803,8 +804,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -834,13 +835,13 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -870,7 +871,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index d847af780acab..c2f6fbfe4667c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,7 +94,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -102,15 +101,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 @@ -126,7 +125,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -134,17 +132,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -156,20 +154,20 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -183,19 +181,18 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -207,8 +204,9 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -216,14 +214,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -233,10 +230,11 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -318,7 +316,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf @@ -359,7 +357,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -398,7 +396,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -445,7 +443,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -492,7 +490,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 @@ -530,7 +528,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -568,7 +566,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-SDAG-LABEL: s_log_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -603,7 +601,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-GISEL-LABEL: s_log_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -749,8 +747,8 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -802,8 +800,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -855,7 +853,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -864,7 +862,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 @@ -921,8 +919,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -986,8 +984,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1037,8 +1035,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -1089,19 +1087,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1122,7 +1120,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1145,19 +1143,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1178,7 +1176,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1355,8 +1353,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf @@ -1419,8 +1417,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1483,8 +1481,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,8 +1563,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1647,8 +1645,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3377d1cf @@ -1710,8 +1708,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1774,32 +1772,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 @@ -1835,32 +1833,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 3f060de9f6596..0a1f7ab6fc0ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,7 +94,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -102,15 +101,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 @@ -126,7 +125,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -134,17 +132,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -156,20 +154,20 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -183,19 +181,18 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -207,8 +204,9 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -216,14 +214,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log10_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -233,10 +230,11 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -318,7 +316,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf @@ -359,7 +357,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -398,7 +396,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -445,7 +443,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -492,7 +490,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a @@ -530,7 +528,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -568,7 +566,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log10_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -603,7 +601,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log10_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -749,8 +747,8 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -802,8 +800,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -855,7 +853,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -864,7 +862,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 @@ -921,8 +919,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -986,8 +984,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1037,8 +1035,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -1089,19 +1087,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1122,7 +1120,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1145,19 +1143,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1178,7 +1176,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1355,8 +1353,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf @@ -1419,8 +1417,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1483,8 +1481,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,8 +1563,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1647,8 +1645,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3284fbcf @@ -1710,8 +1708,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1774,32 +1772,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 @@ -1835,32 +1833,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 90a15ae8d9b28..bc4d03e7af260 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -33,35 +33,35 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -71,14 +71,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -103,43 +103,44 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -147,19 +148,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log2_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -215,7 +216,7 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -241,7 +242,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -265,7 +266,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -289,7 +290,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -313,7 +314,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -336,7 +337,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -359,7 +360,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 @@ -384,7 +385,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -472,8 +473,8 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -504,8 +505,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 @@ -536,8 +537,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -567,11 +568,11 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -598,8 +599,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -623,16 +624,16 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -658,32 +659,32 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v3f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -692,21 +693,21 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 @@ -813,45 +814,45 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v8, s1, v8 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_log_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_log_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -887,8 +888,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -924,8 +925,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -961,8 +962,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -992,13 +993,13 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -1028,42 +1029,41 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v4f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1072,32 +1072,32 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index 826862e124920..3d73f84b6e9a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -17,12 +17,12 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -45,8 +45,8 @@ entry: define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -56,12 +56,12 @@ define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x1c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -84,8 +84,8 @@ entry: define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,12 +95,12 @@ define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -123,8 +123,8 @@ entry: define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xy: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s4, s4, s5 @@ -135,13 +135,13 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xy: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -166,12 +166,12 @@ entry: define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0x6 -; SI-NEXT: s_load_dword s4, s[0:1], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x6 +; SI-NEXT: s_load_dword s5, s[2:3], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s2, s4 +; SI-NEXT: s_mul_i32 s4, s4, s5 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -179,11 +179,11 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x18 -; VI-NEXT: s_load_dword s3, s[0:1], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x18 +; VI-NEXT: s_load_dword s5, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: s_mul_i32 s2, s4, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -211,7 +211,7 @@ entry: define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_yz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s0, s0, s1 @@ -224,7 +224,7 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_yz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -254,13 +254,13 @@ entry: define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xyz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 -; SI-NEXT: s_load_dword s2, s[0:1], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 +; SI-NEXT: s_load_dword s6, s[2:3], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s2 +; SI-NEXT: s_mul_i32 s2, s4, s5 +; SI-NEXT: s_add_i32 s4, s2, s6 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -268,15 +268,15 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xyz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 -; VI-NEXT: s_load_dword s4, s[0:1], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 +; VI-NEXT: s_load_dword s4, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s2, s3 -; VI-NEXT: s_add_i32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: s_add_i32 s0, s0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -304,8 +304,8 @@ entry: define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -315,12 +315,12 @@ define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -345,8 +345,8 @@ entry: define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -356,12 +356,12 @@ define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x1c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -386,8 +386,8 @@ entry: define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -397,12 +397,12 @@ define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 8196999b8f1f1..9327d76e50692 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-LABEL: round_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -24,57 +24,39 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX8-LABEL: round_f32: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v0, s6 -; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: round_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v0, s2 -; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: round_f32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_trunc_f32_e32 v0, s6 +; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX89-NEXT: s_brev_b32 s4, -2 +; GFX89-NEXT: v_mov_b32_e32 v2, s6 +; GFX89-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s2 +; GFX11-NEXT: v_trunc_f32_e32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0 -; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5 +; GFX11-NEXT: v_sub_f32_e32 v1, s4, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -109,7 +91,7 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 { ; GFX6-LABEL: round_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -135,7 +117,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX8-LABEL: round_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_brev_b32 s8, -2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -161,7 +143,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX9-LABEL: round_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_brev_b32 s8, -2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -187,7 +169,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s3 ; GFX11-NEXT: v_trunc_f32_e32 v2, s2 @@ -242,8 +224,8 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 { ; GFX6-LABEL: round_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_brev_b32 s10, -2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -279,89 +261,50 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX8-LABEL: round_v4f32: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v0, s7 -; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s6 -; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s5 -; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s4 -; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: round_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-NEXT: s_brev_b32 s2, -2 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v0, s7 -; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s6 -; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s5 -; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s4 -; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: round_v4f32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX89-NEXT: s_brev_b32 s10, -2 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_trunc_f32_e32 v0, s7 +; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX89-NEXT: v_mov_b32_e32 v2, s7 +; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s6 +; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX89-NEXT: v_mov_b32_e32 v2, s6 +; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s5 +; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX89-NEXT: v_mov_b32_e32 v4, s5 +; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s4 +; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX89-NEXT: v_mov_b32_e32 v5, s4 +; GFX89-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -438,145 +381,145 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 { ; GFX6-LABEL: round_v8f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; GFX6-NEXT: s_brev_b32 s14, -2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_brev_b32 s2, -2 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_trunc_f32_e32 v0, s7 ; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s6 ; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s5 ; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v4 +; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v4 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s4 ; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_bfi_b32 v4, s14, v4, v5 +; GFX6-NEXT: v_bfi_b32 v4, s2, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, s11 ; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v6, s11 -; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s10 ; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s9 ; GFX6-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s9 -; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v8 +; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v8 ; GFX6-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s8 ; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v9, s8 -; GFX6-NEXT: v_bfi_b32 v8, s14, v8, v9 +; GFX6-NEXT: v_bfi_b32 v8, s2, v8, v9 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX89-LABEL: round_v8f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GFX89-NEXT: s_brev_b32 s14, -2 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GFX89-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 +; GFX89-NEXT: s_brev_b32 s2, -2 +; GFX89-NEXT: s_mov_b32 s15, 0xf000 +; GFX89-NEXT: s_mov_b32 s14, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_trunc_f32_e32 v0, s7 ; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s6 ; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s5 ; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4 +; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4 ; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s4 ; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5 +; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5 ; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX89-NEXT: v_trunc_f32_e32 v4, s11 ; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v6, s11 -; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s10 ; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v6, s10 -; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s9 ; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v8, s9 -; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8 +; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8 ; GFX89-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s8 ; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v9, s8 -; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9 +; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9 ; GFX89-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -711,10 +654,10 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-LABEL: round_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_trunc_f32_e32 v1, v0 ; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5 @@ -725,62 +668,44 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX8-LABEL: round_f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f16_e32 v1, s4 -; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 -; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: round_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f16_e32 v1, s2 -; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1 -; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: round_f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX89-NEXT: s_movk_i32 s5, 0x7fff +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_trunc_f16_e32 v1, s4 +; GFX89-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX89-NEXT: v_mov_b32_e32 v2, s4 +; GFX89-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f16_e32 v0, s2 +; GFX11-NEXT: v_trunc_f16_e32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 +; GFX11-NEXT: v_sub_f16_e32 v1, s4, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -824,13 +749,13 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-LABEL: round_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb ; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s3, s2, 16 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_trunc_f32_e32 v3, v1 ; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v0 @@ -849,13 +774,14 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX8-NEXT: s_movk_i32 s6, 0x7fff ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -882,57 +808,57 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; GFX9-LABEL: round_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: v_trunc_f16_e32 v1, s0 -; GFX9-NEXT: v_sub_f16_e32 v2, s0, v1 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: v_trunc_f16_e32 v1, s5 +; GFX9-NEXT: v_sub_f16_e32 v2, s5, v1 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_bfi_b32 v2, s1, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_bfi_b32 v2, s6, v2, v3 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v2 -; GFX9-NEXT: v_trunc_f16_e32 v2, s2 -; GFX9-NEXT: v_sub_f16_e32 v3, s2, v2 +; GFX9-NEXT: v_trunc_f16_e32 v2, s4 +; GFX9-NEXT: v_sub_f16_e32 v3, s4, v2 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_bfi_b32 v0, s6, v0, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_trunc_f16_e32 v1, s2 -; GFX11-NEXT: v_trunc_f16_e32 v0, s3 +; GFX11-NEXT: s_lshr_b32 s5, s4, 16 +; GFX11-NEXT: v_trunc_f16_e32 v1, s4 +; GFX11-NEXT: v_trunc_f16_e32 v0, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 -; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 +; GFX11-NEXT: v_sub_f16_e32 v3, s4, v1 +; GFX11-NEXT: v_sub_f16_e32 v2, s5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s4 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index a54405bf1b471..48df9a0d98d6b 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s2, 0xffff ; VI-NEXT: s_lshr_b32 s2, s2, 16 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s7, s6 @@ -63,7 +63,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX11-LABEL: s_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -90,7 +90,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -142,7 +142,9 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -167,20 +169,20 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -201,22 +203,22 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s0, s8, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s0, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -224,9 +226,10 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -236,9 +239,12 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -260,20 +266,20 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -294,22 +300,22 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s0, s8, 16 -; CI-NEXT: s_and_b32 s1, s8, 0xffff -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_and_b32 s0, s0, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s0, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s1, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshr_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -317,9 +323,10 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -329,9 +336,12 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -353,7 +363,7 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -364,7 +374,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -384,7 +394,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -404,7 +414,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -415,7 +425,9 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -438,7 +450,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -449,7 +461,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -468,7 +480,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -485,7 +497,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -496,7 +508,9 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -519,7 +533,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -531,7 +545,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -553,7 +567,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -582,7 +596,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -594,7 +608,9 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -620,7 +636,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -632,7 +648,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -654,7 +670,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -673,7 +689,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -685,7 +701,9 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 944db3d3adc3a..f2815915b8425 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -15,18 +15,18 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -56,12 +56,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -70,13 +70,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-MAD-LABEL: madak_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -85,8 +85,10 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-MAD-LABEL: madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -103,12 +105,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX940-FMA-LABEL: madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -117,13 +120,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-FMA-LABEL: madak_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -132,8 +135,10 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-FMA-LABEL: madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -165,7 +170,7 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { ; GFX6-LABEL: madak_2_use_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -190,7 +195,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX8-LABEL: madak_2_use_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -220,7 +225,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -240,7 +245,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -259,7 +264,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX11-MAD-LABEL: madak_2_use_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -269,9 +276,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-MAD-NEXT: v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-MAD-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_add_f32 v2, 0x41200000, v2 +; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc @@ -282,7 +289,8 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) @@ -302,7 +310,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -321,7 +329,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX11-FMA-LABEL: madak_2_use_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -365,7 +375,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 { ; GFX6-LABEL: madak_m_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -381,7 +391,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX8-LABEL: madak_m_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -398,7 +408,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] @@ -409,7 +419,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] @@ -420,13 +430,14 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_nop 0 @@ -435,7 +446,8 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] @@ -446,7 +458,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] @@ -457,7 +469,9 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] @@ -484,18 +498,18 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, v3, 4.0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -503,8 +517,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX8-LABEL: madak_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -525,12 +539,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX9-LABEL: madak_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -539,13 +553,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-MAD-LABEL: madak_inline_imm_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -554,8 +568,10 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-MAD-LABEL: madak_inline_imm_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -572,12 +588,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX940-FMA-LABEL: madak_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -586,13 +603,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-FMA-LABEL: madak_inline_imm_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -601,8 +618,10 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-FMA-LABEL: madak_inline_imm_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -632,26 +651,26 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 { ; GFX6-LABEL: s_v_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s8, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_v_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -669,22 +688,23 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX9-LABEL: s_v_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mac_f32_e32 v2, s2, v1 +; GFX9-NEXT: v_mac_f32_e32 v2, s0, v1 ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_v_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_clause 0x1 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) @@ -695,14 +715,15 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-MAD-LABEL: s_v_madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-MAD-NEXT: s_nop 0 @@ -711,22 +732,24 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX940-FMA-LABEL: s_v_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s2, v1 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s0, v1 ; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_v_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_clause 0x1 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) @@ -737,8 +760,10 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-FMA-LABEL: s_v_madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] @@ -763,82 +788,84 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: v_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s2, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: v_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mac_f32_e32 v2, s0, v3 +; GFX8-NEXT: v_mac_f32_e32 v2, s2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e32 v2, s4, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[2:3] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: v_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-MAD-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-MAD-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: v_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] @@ -848,44 +875,47 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a ; ; GFX940-FMA-LABEL: v_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] -; GFX940-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s4, v1 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: v_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-FMA-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: v_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 +; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -905,7 +935,7 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-LABEL: s_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x41200000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -919,7 +949,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX8-LABEL: s_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -931,7 +961,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -942,7 +972,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s7 @@ -952,7 +982,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX11-MAD-LABEL: s_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -964,7 +994,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) @@ -975,7 +1005,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s7 @@ -985,7 +1015,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX11-FMA-LABEL: s_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1003,19 +1033,19 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src0_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1023,8 +1053,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src0_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1046,13 +1076,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src0_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1061,13 +1091,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1076,8 +1106,10 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1094,13 +1126,14 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1109,13 +1142,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1124,8 +1157,10 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1156,19 +1191,19 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src1_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1176,8 +1211,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src1_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1199,13 +1234,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src1_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1214,13 +1249,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1229,8 +1264,10 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1247,13 +1284,14 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1262,13 +1300,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1277,8 +1315,10 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1312,36 +1352,36 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { ; GFX6-LABEL: madak_constant_bus_violation: ; GFX6: ; %bb.0: ; %bb -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX6-NEXT: ; %bb.1: ; %bb3 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: .LBB9_2: ; %bb4 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x12 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x12 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mac_f32_e64 v1, s0, 0.5 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: madak_constant_bus_violation: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 +; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; %bb3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1350,7 +1390,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX8-NEXT: .LBB9_2: ; %bb4 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1361,9 +1401,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX9-LABEL: madak_constant_bus_violation: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %bb3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1372,7 +1412,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX9-NEXT: .LBB9_2: ; %bb4 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1383,9 +1423,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-MAD-LABEL: madak_constant_bus_violation: ; GFX10-MAD: ; %bb.0: ; %bb -; GFX10-MAD-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-MAD-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1394,7 +1434,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX10-MAD-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x42280000 @@ -1405,9 +1445,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-MAD-LABEL: madak_constant_bus_violation: ; GFX11-MAD: ; %bb.0: ; %bb -; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-MAD-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX11-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1416,7 +1456,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX11-MAD-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x48 +; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x48 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v1, s0, 0.5 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1430,9 +1470,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX940-FMA-LABEL: madak_constant_bus_violation: ; GFX940-FMA: ; %bb.0: ; %bb -; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: s_cmp_lg_u32 s2, 0 +; GFX940-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX940-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX940-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1441,7 +1481,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX940-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX940-FMA-NEXT: global_load_dword v0, v[0:1], off sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e64 v1, s0, 0.5 @@ -1452,9 +1492,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-FMA-LABEL: madak_constant_bus_violation: ; GFX10-FMA: ; %bb.0: ; %bb -; GFX10-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1463,7 +1503,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX10-FMA-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x42280000 @@ -1474,9 +1514,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-FMA-LABEL: madak_constant_bus_violation: ; GFX11-FMA: ; %bb.0: ; %bb -; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1485,7 +1525,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX11-FMA-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x48 +; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x48 ; GFX11-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 86a5055ab0704..ed1283af5c476 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] @@ -24,7 +24,7 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x3 @@ -69,7 +69,7 @@ bb: define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -250,11 +250,11 @@ bb: define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readnone %arg1, ptr addrspace(1) noalias nocapture %arg2) { ; GCN-LABEL: vector_clause_indirect: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] +; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 @@ -267,20 +267,20 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap ; ; GCN-SCRATCH-LABEL: vector_clause_indirect: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] +; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x1 ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[4:5], off ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -384,10 +384,10 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 ; GCN-NEXT: s_mov_b32 s19, 0xe00000 -; GCN-NEXT: s_add_u32 s16, s16, s3 +; GCN-NEXT: s_add_u32 s16, s16, s9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 @@ -411,13 +411,13 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; ; GCN-SCRATCH-LABEL: flat_scratch_load: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 -; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 +; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 -; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x44 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1 ; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8 @@ -453,22 +453,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32> %desc) { ; GCN-LABEL: flat_scratch_load_clause: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xe00000 -; GCN-NEXT: s_add_u32 s4, s4, s3 -; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: s_mov_b32 s15, 0xe00000 +; GCN-NEXT: s_add_u32 s12, s12, s9 +; GCN-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: exp mrt0 v0, off, off, off done vm @@ -476,10 +476,10 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; ; GCN-SCRATCH-LABEL: flat_scratch_load_clause: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 -; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 +; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir index b08da2e1848ff..28d30a70e50c1 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir @@ -1,4 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX10 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX11 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX12 @@ -6,11 +7,23 @@ name: merge_s_load_x1_x1 body: | bb.0: - ; CHECK-LABEL: name: merge_s_load_x1_x1 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX10-LABEL: name: merge_s_load_x1_x1 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1 + ; + ; GFX11-LABEL: name: merge_s_load_x1_x1 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; + ; GFX12-LABEL: name: merge_s_load_x1_x1 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 %0:sgpr_64 = IMPLICIT_DEF %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) @@ -46,6 +59,13 @@ body: | name: merge_s_load_x1_x1_x1 body: | bb.0: + ; GFX10-LABEL: name: merge_s_load_x1_x1_x1 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %4.sub0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %4.sub1 + ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; ; GFX11-LABEL: name: merge_s_load_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) @@ -70,6 +90,16 @@ body: | name: merge_s_load_x1_x1_x1_x1 body: | bb.0: + ; GFX10-LABEL: name: merge_s_load_x1_x1_x1_x1 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1 + ; ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) @@ -100,6 +130,24 @@ body: | name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 body: | bb.0: + ; GFX10-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY8]].sub0 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1 + ; ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) @@ -150,6 +198,11 @@ body: | name: merge_s_load_x2_x1 body: | bb.0: + ; GFX10-LABEL: name: merge_s_load_x2_x1 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) + ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; ; GFX11-LABEL: name: merge_s_load_x2_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) @@ -169,11 +222,23 @@ body: | name: merge_s_load_x2_x2 body: | bb.0: - ; CHECK-LABEL: name: merge_s_load_x2_x2 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; GFX10-LABEL: name: merge_s_load_x2_x2 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3 + ; + ; GFX11-LABEL: name: merge_s_load_x2_x2 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; + ; GFX12-LABEL: name: merge_s_load_x2_x2 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) @@ -183,15 +248,35 @@ body: | name: merge_s_load_x2_x2_x2_x2 body: | bb.0: - ; CHECK-LABEL: name: merge_s_load_x2_x2_x2_x2 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; GFX10-LABEL: name: merge_s_load_x2_x2_x2_x2 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; + ; GFX11-LABEL: name: merge_s_load_x2_x2_x2_x2 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; + ; GFX12-LABEL: name: merge_s_load_x2_x2_x2_x2 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) @@ -217,12 +302,181 @@ body: | name: merge_s_load_x4_x4 body: | bb.0: - ; CHECK-LABEL: name: merge_s_load_x4_x4 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX10-LABEL: name: merge_s_load_x4_x4 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + ; + ; GFX11-LABEL: name: merge_s_load_x4_x4 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; + ; GFX12-LABEL: name: merge_s_load_x4_x4 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128)) %2:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128)) ... + +# The constrained multi-dword scalar load merge tests. +--- +name: merge_s_load_x1_x2ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x1_x2ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s64)) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x1_x3ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x1_x3ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96)) +... + +--- +name: merge_s_load_x2ec_x1 +body: | + bb.0: + ; GFX10-LABEL: name: merge_s_load_x2ec_x1 + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) + ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; + ; GFX11-LABEL: name: merge_s_load_x2ec_x1 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; + ; GFX12-LABEL: name: merge_s_load_x2ec_x1 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) +... + +--- +name: merge_s_load_x2ec_x2ec +body: | + bb.0: + ; GFX10-LABEL: name: merge_s_load_x2ec_x2ec + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3 + ; + ; GFX11-LABEL: name: merge_s_load_x2ec_x2ec + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; + ; GFX12-LABEL: name: merge_s_load_x2ec_x2ec + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x2ec_x2ec_x2ec_x2ec +body: | + bb.0: + ; GFX10-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; + ; GFX11-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; + ; GFX12-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) + early-clobber %3:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64)) + early-clobber %4:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x3ec_x1 +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x3ec_x1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32)) +... + +--- +name: merge_s_load_x4ec_x4ec +body: | + bb.0: + ; GFX10-LABEL: name: merge_s_load_x4ec_x4ec + ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + ; + ; GFX11-LABEL: name: merge_s_load_x4ec_x4ec + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; + ; GFX12-LABEL: name: merge_s_load_x4ec_x4ec + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128)) + early-clobber %2:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128)) +... diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 9dafa27ece86f..a77892c8f5fc7 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -31,8 +31,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,8 +53,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -75,12 +75,12 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -89,13 +89,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_sle_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -104,8 +104,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_sle_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -143,7 +145,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -154,7 +156,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -165,7 +167,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -175,7 +177,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_sle_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -185,7 +187,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_sle_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -215,7 +217,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -226,7 +228,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -237,7 +239,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -247,7 +249,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_imin_sle_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -257,7 +259,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_imin_sle_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -290,8 +292,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; CI-LABEL: s_test_imin_sle_v4i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s11, s15 ; CI-NEXT: s_min_i32 s3, s10, s14 @@ -308,8 +310,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: s_test_imin_sle_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s11, s15 ; VI-NEXT: s_min_i32 s3, s10, s14 @@ -326,8 +328,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s11, s15 @@ -344,8 +346,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX10-LABEL: s_test_imin_sle_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s11, s15 @@ -362,8 +364,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX11-LABEL: s_test_imin_sle_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s7, s11 @@ -417,9 +419,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; CI-LABEL: s_test_imin_sle_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0xa -; CI-NEXT: s_load_dword s3, s[4:5], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0xa +; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 @@ -432,9 +434,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; VI-LABEL: s_test_imin_sle_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x28 -; VI-NEXT: s_load_dword s3, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x28 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 @@ -447,9 +449,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX9-LABEL: s_test_imin_sle_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -462,9 +464,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX10-LABEL: s_test_imin_sle_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i8 s2, s2 @@ -477,13 +479,13 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-LABEL: s_test_imin_sle_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i8 s2, s2 -; GFX11-NEXT: s_sext_i32_i8 s3, s3 +; GFX11-NEXT: s_sext_i32_i8 s2, s4 +; GFX11-NEXT: s_sext_i32_i8 s3, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -554,9 +556,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; CI-LABEL: s_test_imin_sle_v4i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0xa -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dword s2, s[6:7], 0xa +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s3, s[6:7], 0x13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -587,9 +589,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x28 -; VI-NEXT: s_load_dword s3, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x28 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2 ; VI-NEXT: v_lshrrev_b16_e64 v1, 8, s3 @@ -616,9 +618,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 @@ -644,9 +646,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-LABEL: s_test_imin_sle_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 @@ -673,29 +675,27 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX11-LABEL: s_test_imin_sle_v4i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28 +; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: s_lshr_b32 s5, s3, 16 -; GFX11-NEXT: v_ashrrev_i16 v0, 8, s2 -; GFX11-NEXT: v_ashrrev_i16 v1, 8, s3 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-NEXT: v_ashrrev_i16 v0, 8, s0 +; GFX11-NEXT: v_ashrrev_i16 v1, 8, s1 ; GFX11-NEXT: v_ashrrev_i16 v2, 8, s4 ; GFX11-NEXT: v_ashrrev_i16 v3, 8, s5 -; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000 +; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000 ; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80000 ; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX11-NEXT: v_min_i16 v4, s2, s3 +; GFX11-NEXT: v_min_i16 v4, s0, s1 ; GFX11-NEXT: v_min_i16 v5, s4, s5 ; GFX11-NEXT: v_min_i16 v2, v2, v3 ; GFX11-NEXT: v_min_i16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -707,6 +707,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -751,7 +752,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -770,7 +771,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -789,7 +790,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -799,7 +800,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX10-LABEL: s_test_imin_sle_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3 @@ -808,7 +809,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX11-LABEL: s_test_imin_sle_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3 @@ -903,8 +904,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -933,8 +934,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -963,34 +964,34 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 ; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 ; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s5, s7 @@ -1030,8 +1031,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1052,8 +1053,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1074,12 +1075,12 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1088,13 +1089,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1103,8 +1104,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1169,8 +1172,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1191,8 +1194,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1213,12 +1216,12 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -1227,13 +1230,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] @@ -1242,8 +1245,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1282,7 +1287,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1293,7 +1298,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1304,7 +1309,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -1314,7 +1319,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_slt_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -1324,7 +1329,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_slt_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -1355,8 +1360,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1369,8 +1374,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: s_test_imin_slt_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1383,36 +1388,36 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s1, s1, s3 ; GFX9-NEXT: s_min_i32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s0, s0, s2 ; GFX10-NEXT: s_min_i32 s1, s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s4, s6 @@ -1443,8 +1448,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_slt_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1455,8 +1460,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_slt_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1467,8 +1472,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_slt_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1479,8 +1484,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_slt_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1491,11 +1496,11 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_slt_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s2, 8 +; GFX11-NEXT: s_min_i32 s2, s4, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1522,8 +1527,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_sle_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1534,8 +1539,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_sle_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1546,8 +1551,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_sle_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1558,8 +1563,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_sle_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1570,11 +1575,11 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_sle_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s2, 8 +; GFX11-NEXT: s_min_i32 s2, s4, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1612,8 +1617,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1634,8 +1639,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1656,12 +1661,12 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1670,13 +1675,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ule_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1685,8 +1690,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ule_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1741,8 +1748,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1765,8 +1772,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1789,12 +1796,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] +; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1805,13 +1812,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] +; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1822,8 +1829,10 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1902,8 +1911,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1938,8 +1947,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1966,12 +1975,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_min_u16 v0, v0, v2 @@ -1982,13 +1991,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX10-NEXT: v_pk_min_u16 v0, v0, v2 @@ -1999,8 +2008,10 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2042,7 +2053,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2053,7 +2064,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2064,7 +2075,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2074,7 +2085,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ule_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2084,7 +2095,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ule_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2125,8 +2136,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2147,8 +2158,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2169,12 +2180,12 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -2183,13 +2194,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ult_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -2198,8 +2209,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ult_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2255,8 +2268,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; CI-LABEL: v_test_umin_ult_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2276,8 +2289,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_test_umin_ult_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2297,11 +2310,11 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_test_umin_ult_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] @@ -2310,12 +2323,12 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX10-LABEL: v_test_umin_ult_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] @@ -2324,8 +2337,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX11-LABEL: v_test_umin_ult_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] @@ -2363,7 +2377,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2374,7 +2388,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2385,7 +2399,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2395,7 +2409,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ult_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2405,7 +2419,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ult_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2457,7 +2471,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2478,7 +2492,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2499,7 +2513,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i32_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2517,7 +2531,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i32_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2535,7 +2549,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i32_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 @@ -2607,7 +2621,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2629,7 +2643,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2651,7 +2665,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i16_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -2666,7 +2680,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i16_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -2682,7 +2696,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i16_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2721,7 +2735,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2732,7 +2746,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2743,7 +2757,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2753,7 +2767,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_umin_ult_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2763,7 +2777,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_umin_ult_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2804,8 +2818,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s4, s11, s19 ; CI-NEXT: s_min_u32 s5, s10, s18 @@ -2835,8 +2849,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s4, s11, s19 ; VI-NEXT: s_min_u32 s5, s10, s18 @@ -2866,8 +2880,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v8i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s4, s9, s17 @@ -2894,8 +2908,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX10-LABEL: s_test_umin_ult_v8i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s4, s9, s17 @@ -2921,8 +2935,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX11-LABEL: s_test_umin_ult_v8i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x20 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x20 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s7, s15 @@ -3095,8 +3109,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: s_and_b32 s3, s8, 0xffff @@ -3141,8 +3155,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; VI-LABEL: s_test_umin_ult_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s11, 16 ; VI-NEXT: s_lshr_b32 s4, s10, 16 @@ -3187,8 +3201,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; GFX9-LABEL: s_test_umin_ult_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s15 @@ -3205,8 +3219,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX10-LABEL: s_test_umin_ult_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v3, s11, s15 @@ -3219,8 +3233,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX11-LABEL: s_test_umin_ult_v8i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_u16 v3, s7, s11 @@ -3263,9 +3277,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0xa -; CI-NEXT: s_load_dword s3, s[4:5], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0xa +; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3278,9 +3292,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x28 -; VI-NEXT: s_load_dword s3, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x28 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3293,9 +3307,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -3308,9 +3322,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff @@ -3323,13 +3337,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_and_b32 s3, s5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3372,9 +3386,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0xa -; CI-NEXT: s_load_dword s3, s[4:5], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0xa +; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 @@ -3387,9 +3401,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x28 -; VI-NEXT: s_load_dword s3, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x28 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 @@ -3402,9 +3416,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -3417,9 +3431,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 @@ -3432,13 +3446,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s2 -; GFX11-NEXT: s_sext_i32_i16 s3, s3 +; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s3, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3489,8 +3503,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; CI-LABEL: s_test_imin_sle_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3503,8 +3517,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; VI-LABEL: s_test_imin_sle_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3517,8 +3531,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; GFX9-LABEL: s_test_imin_sle_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s3, s2 @@ -3531,8 +3545,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-LABEL: s_test_imin_sle_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s3, s2 @@ -3545,14 +3559,14 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-LABEL: s_test_imin_sle_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s3, s2 -; GFX11-NEXT: s_ashr_i32 s2, s2, 16 +; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_ashr_i32 s3, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s3, s2 +; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3585,8 +3599,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ult_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3603,8 +3617,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ult_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3621,16 +3635,16 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ult_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3639,14 +3653,14 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ult_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5] +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3655,8 +3669,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ult_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[6:7], s[0:1] @@ -3695,8 +3709,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ule_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3713,8 +3727,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ule_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3731,16 +3745,16 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ule_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3749,14 +3763,14 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ule_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_u64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5] +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3765,8 +3779,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ule_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s2, s[6:7], s[0:1] @@ -3805,8 +3819,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_slt_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3823,8 +3837,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_slt_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3841,16 +3855,16 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_slt_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3859,14 +3873,14 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_slt_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5] +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3875,8 +3889,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_slt_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[0:1] @@ -3915,8 +3929,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_sle_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3933,8 +3947,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_sle_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3951,16 +3965,16 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_sle_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3969,14 +3983,14 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_sle_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_i64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5] +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3985,8 +3999,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_sle_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s2, s[6:7], s[0:1] @@ -4048,8 +4062,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4079,8 +4093,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4103,12 +4117,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4117,13 +4131,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_sle_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4132,8 +4146,10 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_sle_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4198,8 +4214,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_ule_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4228,8 +4244,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_ule_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4252,12 +4268,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_ule_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4266,13 +4282,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_ule_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4281,8 +4297,10 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_ule_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index c98cfa08160ca..b696f097d05b7 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,7 +8,7 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 @@ -45,7 +45,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -87,7 +87,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 @@ -125,7 +125,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -168,7 +168,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 @@ -205,7 +205,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -249,7 +249,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 @@ -287,7 +287,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index b4272049f36a4..0889f8ef6316e 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_mul_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_mul_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -50,7 +50,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_mul_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: test_mul_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_mul_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_mul_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -157,7 +157,7 @@ entry: define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -179,7 +179,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_mul_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -201,7 +201,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_mul_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -223,7 +223,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_mul_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_mul_v4i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -271,7 +271,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_mul_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -326,9 +326,9 @@ entry: define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: s_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -341,9 +341,9 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; VI-LABEL: s_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[0:1], 0x34 +; VI-NEXT: s_load_dword s7, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -356,10 +356,10 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; GFX9-LABEL: s_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 -; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 +; GFX9-NEXT: s_load_dword s7, s[2:3], 0x34 +; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -373,11 +373,11 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX10-LABEL: s_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mul_i32 s0, s2, s6 +; GFX10-NEXT: s_mul_i32 s0, s0, s6 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -386,8 +386,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX11-LABEL: s_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mul_i32 s0, s0, s6 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX12-LABEL: s_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mul_i32 s0, s0, s6 @@ -433,98 +433,98 @@ entry: define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s3 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s2 -; GFX10-NEXT: s_mov_b32 s15, s3 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s10 +; GFX10-NEXT: s_mov_b32 s15, s11 +; GFX10-NEXT: s_mov_b32 s2, s10 +; GFX10-NEXT: s_mov_b32 s3, s11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -548,8 +548,8 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-LABEL: v_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s14, s10 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,11 +617,11 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 +; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s4, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -630,43 +630,43 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50 -; GFX9-NEXT: s_mulk_i32 s2, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mul_hi_i32 s5, s4, 0x50 +; GFX9-NEXT: s_mulk_i32 s4, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_sext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 -; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 +; GFX10-NEXT: s_mul_hi_i32 s3, s4, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_sext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 -; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 +; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 +; GFX11-NEXT: s_mul_hi_i32 s3, s4, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -676,7 +676,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -711,8 +711,8 @@ entry: define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -725,11 +725,11 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -738,43 +738,43 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s0, s2, 0x50 -; GFX9-NEXT: s_mulk_i32 s2, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0x50 +; GFX9-NEXT: s_mulk_i32 s4, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_zext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 -; GFX10-NEXT: s_mul_hi_u32 s1, s2, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s3, s4, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_zext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 -; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 +; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 +; GFX11-NEXT: s_mul_hi_u32 s3, s4, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -784,7 +784,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 @@ -818,7 +818,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -838,7 +838,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -857,7 +857,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -877,7 +877,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_sext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_sext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -965,7 +965,7 @@ entry: define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -985,7 +985,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_zext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1043,7 +1043,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_zext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1112,7 +1112,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_inline_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1131,7 +1131,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_mul64_sext_inline_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_mul64_sext_inline_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_mul64_sext_inline_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_mul64_sext_inline_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_mul64_sext_inline_imm: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1256,9 +1256,9 @@ entry: define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: s_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dword s5, s[0:1], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1269,9 +1269,9 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; VI-LABEL: s_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x4c -; VI-NEXT: s_load_dword s5, s[0:1], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x4c +; VI-NEXT: s_load_dword s5, s[2:3], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1282,40 +1282,41 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; GFX9-LABEL: s_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_mul_i32 s2, s4, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mul_i32 s2, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -1326,12 +1327,13 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX12-LABEL: s_mul_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mul_i32 s2, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1358,7 +1360,7 @@ entry: define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1376,7 +1378,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1394,7 +1396,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1412,7 +1414,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: v_mul_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1430,7 +1432,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: v_mul_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1450,7 +1452,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_mul_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1496,9 +1498,9 @@ entry: define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { ; SI-LABEL: s_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dword s5, s[0:1], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1510,9 +1512,9 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; VI-LABEL: s_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x70 -; VI-NEXT: s_load_dword s5, s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x70 +; VI-NEXT: s_load_dword s5, s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1524,42 +1526,42 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; GFX9-LABEL: s_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x70 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mul_lo_u16_e32 v0, s5, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i1: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX10-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -1570,13 +1572,13 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-LABEL: s_mul_i1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -1620,7 +1622,7 @@ entry: define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1640,7 +1642,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: v_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1660,7 +1662,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: v_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1680,7 +1682,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX10-LABEL: v_mul_i1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1701,7 +1703,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX11-LABEL: v_mul_i1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1725,7 +1727,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX12-LABEL: v_mul_i1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1793,8 +1795,8 @@ entry: define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1813,8 +1815,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; VI-LABEL: s_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1831,8 +1833,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; GFX9-LABEL: s_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1852,18 +1854,18 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX10-LABEL: s_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s6, s3 -; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_add_i32 s0, s1, s0 -; GFX10-NEXT: s_mul_i32 s1, s7, s2 -; GFX10-NEXT: s_mul_i32 s2, s6, s2 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mul_i32 s1, s6, s1 +; GFX10-NEXT: s_mul_hi_u32 s2, s6, s0 +; GFX10-NEXT: s_add_i32 s1, s2, s1 +; GFX10-NEXT: s_mul_i32 s2, s7, s0 +; GFX10-NEXT: s_mul_i32 s0, s6, s0 +; GFX10-NEXT: s_add_i32 s1, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 @@ -1873,8 +1875,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX11-LABEL: s_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s1, s6, s1 @@ -1896,8 +1898,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX12-LABEL: s_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 @@ -1932,21 +1934,21 @@ entry: define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; SI-LABEL: v_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v1, v2, v1 ; SI-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -1954,52 +1956,52 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: v_mul_lo_u32 v0, v2, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v4, v2, v1 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, v0, 0 ; VI-NEXT: v_mul_lo_u32 v0, v3, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2007,27 +2009,27 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 -; GFX10-NEXT: s_mov_b32 s14, s2 -; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, s10 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s14, s10 +; GFX10-NEXT: s_mov_b32 s15, s11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2035,14 +2037,14 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2072,8 +2074,8 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-LABEL: v_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, s10 @@ -2134,19 +2136,19 @@ entry: define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { ; SI-LABEL: mul32_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mul_i32 s6, s2, s3 +; SI-NEXT: s_mul_i32 s6, s0, s1 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc @@ -2169,19 +2171,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul32_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mul_i32 s6, s2, s3 +; VI-NEXT: s_mul_i32 s6, s0, s1 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: .LBB15_3: ; %Flow -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %if @@ -2204,19 +2206,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul32_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s6, s2, s3 +; GFX9-NEXT: s_mul_i32 s6, s0, s1 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: .LBB15_3: ; %Flow -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %if @@ -2239,19 +2241,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul32_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s5, s2, s3 +; GFX10-NEXT: s_mul_i32 s5, s0, s1 ; GFX10-NEXT: s_branch .LBB15_3 ; GFX10-NEXT: .LBB15_2: ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: .LBB15_3: ; %Flow -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX10-NEXT: ; %bb.4: ; %if @@ -2274,19 +2276,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul32_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_mul_i32 s5, s2, s3 +; GFX11-NEXT: s_mul_i32 s5, s0, s1 ; GFX11-NEXT: s_branch .LBB15_3 ; GFX11-NEXT: .LBB15_2: ; GFX11-NEXT: s_mov_b32 s4, -1 ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: .LBB15_3: ; %Flow -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX11-NEXT: ; %bb.4: ; %if @@ -2311,19 +2313,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul32_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX12-NEXT: ; %bb.1: ; %else -; GFX12-NEXT: s_mul_i32 s5, s2, s3 +; GFX12-NEXT: s_mul_i32 s5, s0, s1 ; GFX12-NEXT: s_branch .LBB15_3 ; GFX12-NEXT: .LBB15_2: ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: ; implicit-def: $sgpr5 ; GFX12-NEXT: .LBB15_3: ; %Flow -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX12-NEXT: ; %bb.4: ; %if @@ -2403,7 +2405,7 @@ endif: define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; SI-LABEL: mul64_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -2438,7 +2440,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul64_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2470,7 +2472,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2506,7 +2508,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2540,7 +2542,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2575,7 +2577,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2668,9 +2670,9 @@ endif: define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { ; SI-LABEL: s_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x1f +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2717,9 +2719,9 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; VI-LABEL: s_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2756,96 +2758,96 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s12, s11 -; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 -; GFX9-NEXT: s_mul_i32 s2, s14, s9 -; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 -; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s13, s10 -; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: s_mul_i32 s3, s15, s8 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s12, s10 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 s3, s14, s8 -; GFX9-NEXT: s_add_u32 s3, s3, s1 -; GFX9-NEXT: s_addc_u32 s2, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s9, s12 -; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 +; GFX9-NEXT: s_mul_i32 s7, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6 +; GFX9-NEXT: s_add_i32 s7, s12, s7 +; GFX9-NEXT: s_mul_i32 s12, s9, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s12 +; GFX9-NEXT: s_mul_i32 s12, s10, s5 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s11, s11, s4 +; GFX9-NEXT: s_mul_i32 s6, s8, s6 +; GFX9-NEXT: s_add_i32 s12, s12, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s4 +; GFX9-NEXT: s_add_u32 s10, s10, s6 +; GFX9-NEXT: s_addc_u32 s11, s12, s7 +; GFX9-NEXT: s_mul_i32 s14, s5, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s13, s5, s8 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s1, s8, s13 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 -; GFX9-NEXT: s_add_u32 s1, s1, s14 -; GFX9-NEXT: s_addc_u32 s10, s10, 0 -; GFX9-NEXT: s_add_u32 s10, s11, s10 -; GFX9-NEXT: s_addc_u32 s11, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 -; GFX9-NEXT: s_mul_i32 s9, s9, s13 -; GFX9-NEXT: s_add_u32 s9, s9, s10 -; GFX9-NEXT: s_addc_u32 s10, s14, s11 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 s9, s9, s3 -; GFX9-NEXT: s_addc_u32 s10, s10, s2 -; GFX9-NEXT: s_mul_i32 s2, s8, s12 -; GFX9-NEXT: s_mov_b32 s3, s0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_mul_i32 s7, s4, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 +; GFX9-NEXT: s_add_u32 s7, s7, s14 +; GFX9-NEXT: s_addc_u32 s12, s12, 0 +; GFX9-NEXT: s_add_u32 s12, s13, s12 +; GFX9-NEXT: s_addc_u32 s13, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s5, s9 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_add_u32 s5, s5, s12 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_addc_u32 s9, s14, s13 +; GFX9-NEXT: s_add_u32 s10, s5, s10 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mov_b32 s5, s6 +; GFX9-NEXT: s_addc_u32 s9, s9, s11 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s13, s2 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s12, 0 +; GFX10-NEXT: s_mov_b32 s3, s12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s3, s8, s7 +; GFX10-NEXT: s_mul_i32 s2, s8, s7 ; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX10-NEXT: s_mul_i32 s14, s10, s5 ; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX10-NEXT: s_mul_i32 s12, s9, s6 +; GFX10-NEXT: s_mul_i32 s13, s9, s6 ; GFX10-NEXT: s_mul_i32 s11, s11, s4 -; GFX10-NEXT: s_add_i32 s3, s7, s3 +; GFX10-NEXT: s_add_i32 s2, s7, s2 ; GFX10-NEXT: s_add_i32 s7, s15, s14 ; GFX10-NEXT: s_mul_i32 s6, s8, s6 ; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: s_add_i32 s3, s3, s12 +; GFX10-NEXT: s_add_i32 s2, s2, s13 ; GFX10-NEXT: s_add_i32 s7, s7, s11 ; GFX10-NEXT: s_mul_i32 s19, s5, s8 ; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX10-NEXT: s_add_u32 s6, s10, s6 ; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX10-NEXT: s_addc_u32 s7, s7, s3 +; GFX10-NEXT: s_addc_u32 s7, s7, s2 ; GFX10-NEXT: s_mul_i32 s17, s4, s9 -; GFX10-NEXT: s_add_u32 s3, s19, s20 +; GFX10-NEXT: s_add_u32 s2, s19, s20 ; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX10-NEXT: s_mul_i32 s5, s5, s9 ; GFX10-NEXT: s_addc_u32 s9, s18, 0 -; GFX10-NEXT: s_add_u32 s3, s17, s3 +; GFX10-NEXT: s_add_u32 s13, s17, s2 ; GFX10-NEXT: s_addc_u32 s10, s16, 0 -; GFX10-NEXT: s_mul_i32 s12, s4, s8 +; GFX10-NEXT: s_mul_i32 s2, s4, s8 ; GFX10-NEXT: s_add_u32 s4, s9, s10 ; GFX10-NEXT: s_addc_u32 s8, 0, 0 ; GFX10-NEXT: s_add_u32 s4, s5, s4 ; GFX10-NEXT: s_addc_u32 s5, s21, s8 ; GFX10-NEXT: s_add_u32 s4, s4, s6 ; GFX10-NEXT: s_addc_u32 s5, s5, s7 -; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -2858,46 +2860,46 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX11-LABEL: s_mul_i128: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x4c +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x7c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s12, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s13, s2 +; GFX11-NEXT: s_mov_b32 s3, s12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s3, s8, s7 +; GFX11-NEXT: s_mul_i32 s2, s8, s7 ; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX11-NEXT: s_mul_i32 s14, s10, s5 ; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX11-NEXT: s_mul_i32 s12, s9, s6 +; GFX11-NEXT: s_mul_i32 s13, s9, s6 ; GFX11-NEXT: s_mul_i32 s11, s11, s4 -; GFX11-NEXT: s_add_i32 s3, s7, s3 +; GFX11-NEXT: s_add_i32 s2, s7, s2 ; GFX11-NEXT: s_add_i32 s7, s15, s14 ; GFX11-NEXT: s_mul_i32 s6, s8, s6 ; GFX11-NEXT: s_mul_i32 s10, s10, s4 -; GFX11-NEXT: s_add_i32 s3, s3, s12 +; GFX11-NEXT: s_add_i32 s2, s2, s13 ; GFX11-NEXT: s_add_i32 s7, s7, s11 ; GFX11-NEXT: s_mul_i32 s19, s5, s8 ; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX11-NEXT: s_add_u32 s6, s10, s6 ; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX11-NEXT: s_addc_u32 s7, s7, s3 +; GFX11-NEXT: s_addc_u32 s7, s7, s2 ; GFX11-NEXT: s_mul_i32 s17, s4, s9 -; GFX11-NEXT: s_add_u32 s3, s19, s20 +; GFX11-NEXT: s_add_u32 s2, s19, s20 ; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX11-NEXT: s_mul_i32 s5, s5, s9 ; GFX11-NEXT: s_addc_u32 s9, s18, 0 -; GFX11-NEXT: s_add_u32 s3, s17, s3 +; GFX11-NEXT: s_add_u32 s13, s17, s2 ; GFX11-NEXT: s_addc_u32 s10, s16, 0 -; GFX11-NEXT: s_mul_i32 s12, s4, s8 +; GFX11-NEXT: s_mul_i32 s2, s4, s8 ; GFX11-NEXT: s_add_u32 s4, s9, s10 ; GFX11-NEXT: s_addc_u32 s8, 0, 0 ; GFX11-NEXT: s_add_u32 s4, s5, s4 ; GFX11-NEXT: s_addc_u32 s5, s21, s8 ; GFX11-NEXT: s_add_u32 s4, s4, s6 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 -; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 @@ -2911,40 +2913,40 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX12-LABEL: s_mul_i128: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c -; GFX12-NEXT: s_mov_b32 s3, 0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_mov_b32 s15, s3 -; GFX12-NEXT: s_mov_b32 s13, s3 -; GFX12-NEXT: s_mov_b32 s17, s3 -; GFX12-NEXT: s_mov_b32 s19, s3 -; GFX12-NEXT: s_mov_b32 s24, s3 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x7c +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x4c +; GFX12-NEXT: s_mov_b32 s13, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b32 s15, s13 +; GFX12-NEXT: s_mov_b32 s3, s13 +; GFX12-NEXT: s_mov_b32 s17, s13 +; GFX12-NEXT: s_mov_b32 s19, s13 +; GFX12-NEXT: s_mov_b32 s24, s13 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s2, s4 +; GFX12-NEXT: s_mov_b32 s12, s4 ; GFX12-NEXT: s_mov_b32 s14, s8 -; GFX12-NEXT: s_mov_b32 s12, s9 -; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3] -; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3] -; GFX12-NEXT: s_mov_b32 s2, s23 +; GFX12-NEXT: s_mov_b32 s2, s9 +; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13] +; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13] +; GFX12-NEXT: s_mov_b32 s12, s23 ; GFX12-NEXT: s_mov_b32 s16, s5 ; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11] -; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3] +; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[12:13] ; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9] ; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17] -; GFX12-NEXT: s_mov_b32 s2, s11 -; GFX12-NEXT: s_mov_b32 s11, s3 +; GFX12-NEXT: s_mov_b32 s12, s11 +; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5] ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11] -; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17] +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[16:17] ; GFX12-NEXT: s_mov_b32 s18, s7 -; GFX12-NEXT: s_mov_b32 s23, s3 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] ; GFX12-NEXT: s_mov_b32 s25, s6 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3] -; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25] +; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19] +; GFX12-NEXT: s_mov_b32 s23, s13 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] +; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 @@ -3011,7 +3013,7 @@ entry: define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { ; SI-LABEL: v_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 @@ -3060,7 +3062,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; VI-LABEL: v_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3100,7 +3102,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3131,7 +3133,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX10-LABEL: v_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3163,7 +3165,9 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX11-LABEL: v_mul_i128: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3201,7 +3205,9 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX12-LABEL: v_mul_i128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 58fd4b9bd2fee..be77a10380c49 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ entry: define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smulhi24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,7 +126,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -274,26 +274,26 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { ; SI-LABEL: test_smul24_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0x13 -; SI-NEXT: s_load_dword s0, s[0:1], 0x1c -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 -; SI-NEXT: s_bfe_i32 s0, s0, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_mul_i32 s1, s0, s1 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: s_bfe_i32 s5, s5, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mul_i32 s4, s5, s4 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x4c -; VI-NEXT: s_load_dword s5, s[0:1], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x4c +; VI-NEXT: s_load_dword s5, s[2:3], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -307,19 +307,19 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 ; ; GFX9-LABEL: test_smul24_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s6, s5, s4 +; GFX9-NEXT: s_mul_i32 s5, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64: @@ -376,8 +376,8 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i64_square: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -390,8 +390,8 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_smul24_i64_square: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -403,17 +403,17 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_smul24_i64_square: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s5, s4, s4 +; GFX9-NEXT: s_mul_i32 s4, s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64_square: @@ -463,33 +463,33 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s2, 8 -; SI-NEXT: s_lshl_b32 s3, s0, 8 -; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_mul_i32 s1, s0, s2 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_lshl_b32 s5, s4, 8 +; SI-NEXT: s_lshl_b32 s7, s6, 8 +; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 +; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_mul_i32 s5, s4, s6 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dword s5, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s2, 8 -; VI-NEXT: s_lshl_b32 s5, s4, 8 +; VI-NEXT: s_lshl_b32 s3, s4, 8 +; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -504,23 +504,23 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_smul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s2, 8 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GFX9-NEXT: s_lshl_b32 s1, s3, 8 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 -; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2 -; GFX9-NEXT: s_mul_i32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_lshl_b32 s5, s4, 8 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GFX9-NEXT: s_lshl_b32 s5, s6, 8 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 +; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i33: @@ -580,9 +580,9 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dword s5, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dword s5, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,9 +594,9 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dword s5, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dword s5, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -608,20 +608,20 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_smulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s2, 8 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GFX9-NEXT: s_lshl_b32 s1, s3, 8 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 -; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_lshl_b32 s5, s4, 8 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GFX9-NEXT: s_lshl_b32 s5, s6, 8 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i33: @@ -672,15 +672,15 @@ entry: define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { ; SI-LABEL: simplify_i24_crash: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %bb7 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB8_2: ; %bb11 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 @@ -694,15 +694,15 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; VI-LABEL: simplify_i24_crash: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %bb7 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB8_2: ; %bb11 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,24 +716,24 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; GFX9-LABEL: simplify_i24_crash: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %bb7 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB8_2: ; %bb11 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0x180000 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s6, 0x180000 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: simplify_i24_crash: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 698a54de108f7..342f36b6fa622 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffffff @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -64,13 +64,13 @@ entry: define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16_sext: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: s_mul_i32 s2, s2, s4 -; SI-NEXT: s_sext_i32_i16 s4, s2 +; SI-NEXT: s_lshr_b32 s2, s4, 16 +; SI-NEXT: s_mul_i32 s4, s4, s2 +; SI-NEXT: s_sext_i32_i16 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; VI-LABEL: test_umul24_i16_sext: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,16 +92,16 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; GFX9-LABEL: test_umul24_i16_sext: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: s_mul_i32 s2, s2, s0 -; GFX9-NEXT: s_sext_i32_i16 s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_sext_i32_i16 s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -113,7 +113,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr_sext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; VI-LABEL: test_umul24_i16_vgpr_sext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -158,7 +158,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: test_umul24_i16_vgpr_sext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -186,13 +186,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: s_mul_i32 s2, s2, s4 -; SI-NEXT: s_and_b32 s4, s2, 0xffff +; SI-NEXT: s_lshr_b32 s2, s4, 16 +; SI-NEXT: s_mul_i32 s4, s4, s2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -200,8 +200,8 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; VI-LABEL: test_umul24_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -214,16 +214,16 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; GFX9-LABEL: test_umul24_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: s_mul_i32 s2, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -235,7 +235,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -258,7 +258,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_umul24_i16_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_umul24_i16_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,8 +307,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; SI-LABEL: test_umul24_i8_vgpr: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: v_mov_b32_e32 v4, 0 @@ -330,8 +330,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: test_umul24_i8_vgpr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 @@ -351,11 +351,11 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: test_umul24_i8_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v3, v1, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -379,7 +379,7 @@ entry: define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi24_i32_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,7 +405,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -432,9 +432,9 @@ entry: define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umulhi24: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -447,9 +447,9 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: test_umulhi24: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[0:1], 0x34 +; VI-NEXT: s_load_dword s7, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -462,19 +462,18 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX9-LABEL: test_umulhi24: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i64 %a, 16777215 @@ -490,9 +489,9 @@ entry: define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umul24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -509,9 +508,9 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; VI-LABEL: test_umul24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[0:1], 0x34 +; VI-NEXT: s_load_dword s7, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -525,21 +524,20 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; GFX9-LABEL: test_umul24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s2, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s1, s0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -582,8 +580,8 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: test_umul24_i64_square: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -596,8 +594,8 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: test_umul24_i64_square: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -608,17 +606,17 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; GFX9-LABEL: test_umul24_i64_square: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s4 +; GFX9-NEXT: s_mul_i32 s4, s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -631,7 +629,7 @@ entry: define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffff @@ -647,7 +645,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -663,7 +661,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s6, 0xffff @@ -685,27 +683,27 @@ entry: define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dword s5, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s1, s2, 0xffffff -; SI-NEXT: s_and_b32 s3, s0, 0xffffff -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 -; SI-NEXT: s_mul_i32 s1, s1, s3 +; SI-NEXT: s_and_b32 s6, s4, 0xffffff +; SI-NEXT: s_and_b32 s7, s5, 0xffffff +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 +; SI-NEXT: s_mul_i32 s6, s6, s7 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dword s5, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dword s5, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -718,20 +716,20 @@ define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_umul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_i32 s2, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_mul_i32 s6, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 @@ -747,9 +745,9 @@ entry: define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dword s5, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dword s5, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -761,9 +759,9 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_umulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dword s5, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dword s5, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -775,18 +773,18 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_umulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 0473f803bfb30..eff80236d9866 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -70,7 +70,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: scalar_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -185,40 +185,40 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { ; GFX6-LABEL: vector_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s12, s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s12, s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s2, s10 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s11, s3 -; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s0, s6 +; GFX6-NEXT: s_mov_b32 s1, s7 +; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s12, s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s12, s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s2, s10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s11, s3 -; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s0, s6 +; GFX8-NEXT: s_mov_b32 s1, s7 +; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i32: @@ -246,8 +246,8 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: scalar_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -258,8 +258,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; ; GFX8-LABEL: scalar_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -286,8 +286,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_literal_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -300,8 +300,8 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; ; GFX8-LABEL: scalar_or_literal_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -332,43 +332,43 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: scalar_or_literal_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d ; GFX6-NEXT: s_movk_i32 s8, 0x3039 ; GFX6-NEXT: s_mov_b32 s9, 0xf237b -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_add_u32 s0, s0, 0x3039 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX6-NEXT: s_addc_u32 s1, s1, 0xf237b +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_add_u32 s0, s6, 0x3039 +; GFX6-NEXT: s_addc_u32 s1, s7, 0xf237b ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_literal_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x74 ; GFX8-NEXT: s_movk_i32 s8, 0x3039 ; GFX8-NEXT: s_mov_b32 s9, 0xf237b ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s0, s0, 0x3039 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_add_u32 s0, s2, 0x3039 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0xf237b +; GFX8-NEXT: s_addc_u32 s1, s3, 0xf237b ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -408,8 +408,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -421,8 +421,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; GFX8-LABEL: scalar_or_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -451,44 +451,44 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_or_b32 s4, s6, 63 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_or_b32 s2, s6, 63 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_add_u32 s0, s8, 63 -; GFX6-NEXT: s_addc_u32 s1, s9, 0 +; GFX6-NEXT: s_add_u32 s0, s0, 63 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_or_b32 s4, s6, 63 -; GFX8-NEXT: s_mov_b32 s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_or_b32 s2, s6, 63 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_add_u32 s0, s8, 63 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s0, 63 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; @@ -521,8 +521,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_neg_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, -1 @@ -534,8 +534,8 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; ; GFX8-LABEL: scalar_or_neg_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 @@ -565,7 +565,7 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -583,7 +583,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -624,7 +624,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_inline_immediate_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -642,7 +642,7 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; ; GFX8-LABEL: vector_or_inline_immediate_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -683,8 +683,8 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -698,8 +698,8 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX8-LABEL: scalar_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -730,48 +730,48 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_mov_b32 s3, s11 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s2 -; GFX6-NEXT: s_mov_b32 s15, s3 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s14, s10 +; GFX6-NEXT: s_mov_b32 s15, s11 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_mov_b32 s10, s2 -; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_mov_b32 s3, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s2 -; GFX8-NEXT: s_mov_b32 s15, s3 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s14, s10 +; GFX8-NEXT: s_mov_b32 s15, s11 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i64: @@ -803,42 +803,42 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { ; GFX6-LABEL: scalar_vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s2, s10 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s11, s3 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s0, s6 +; GFX6-NEXT: s_mov_b32 s1, s7 +; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s2, s10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s11, s3 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s0, s6 +; GFX8-NEXT: s_mov_b32 s1, s7 +; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: scalar_vector_or_i64: @@ -867,7 +867,7 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_loadimm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -886,7 +886,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_i64_loadimm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -931,7 +931,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -949,7 +949,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: vector_or_i64_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -990,7 +990,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_inline_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: vector_or_i64_neg_inline_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_literal: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: vector_or_i64_neg_literal: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1116,9 +1116,9 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: trunc_i64_or_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 -; GFX6-NEXT: s_load_dword s5, s[0:1], 0x1d -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 +; GFX6-NEXT: s_load_dword s5, s[2:3], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,9 +1129,9 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; ; GFX8-LABEL: trunc_i64_or_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1159,21 +1159,21 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; GFX6-LABEL: or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_mov_b32 s3, s11 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s2 -; GFX6-NEXT: s_mov_b32 s15, s3 -; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s14, s10 +; GFX6-NEXT: s_mov_b32 s15, s11 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_waitcnt vmcnt(1) ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1181,26 +1181,26 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_mov_b32 s10, s2 -; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_mov_b32 s3, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s2 -; GFX8-NEXT: s_mov_b32 s15, s3 -; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s14, s10 +; GFX8-NEXT: s_mov_b32 s15, s11 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: or_i1: @@ -1244,8 +1244,8 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GFX6-LABEL: s_or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,8 +1260,8 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; ; GFX8-LABEL: s_or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index e076df97e1ba4..21fd193e8db54 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -20,21 +20,23 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm @@ -51,23 +53,25 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out @@ -87,24 +91,24 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; @@ -120,25 +124,25 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -158,21 +162,23 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm @@ -189,23 +195,25 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -224,19 +232,21 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm @@ -252,21 +262,23 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i32 %arg0, ptr addrspace(1) %out ret void @@ -287,25 +299,27 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s2, s0 +; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s5, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s5, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: @@ -321,25 +335,27 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s6, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9] +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s3, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s3, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %add = add i32 %arg0, %arg1 store i32 %add, ptr addrspace(1) %out @@ -361,27 +377,27 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm @@ -400,29 +416,29 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s2, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s2, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -443,25 +459,23 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-2-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: @@ -475,25 +489,23 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7] +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <2 x i8> %in, ptr addrspace(1) %out ret void @@ -516,32 +528,30 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: byref_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-8-NEXT: s_endpgm ; @@ -560,32 +570,30 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-8-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -616,43 +624,41 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: @@ -675,43 +681,41 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i32> %in, ptr addrspace(1) %out, align 4 ret void @@ -730,25 +734,25 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: @@ -763,25 +767,25 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i16> %in, ptr addrspace(1) %out, align 4 ret void @@ -801,24 +805,26 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; @@ -835,24 +841,26 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i32> %in, ptr addrspace(1) %out, align 4 @@ -873,24 +881,26 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; @@ -907,24 +917,26 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x float> %in, ptr addrspace(1) %out, align 4 @@ -944,39 +956,25 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-2-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-8-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: @@ -991,39 +989,25 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9 -; GFX90a-PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7] +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9 -; GFX90a-PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7] +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x i8> %in, ptr addrspace(1) %out, align 4 ret void @@ -1053,49 +1037,47 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: @@ -1121,49 +1103,47 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void @@ -1180,53 +1160,21 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 24 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 16 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: @@ -1239,51 +1187,21 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 24 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i8> %in, ptr addrspace(1) %out ret void @@ -1301,21 +1219,23 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg: @@ -1329,21 +1249,23 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i64 %a, ptr addrspace(1) %out, align 8 ret void @@ -1361,21 +1283,23 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg: @@ -1389,21 +1313,23 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store double %in, ptr addrspace(1) %out ret void @@ -1421,19 +1347,21 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half % ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm @@ -1449,21 +1377,23 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half % ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store half %in, ptr addrspace(1) %out ret void @@ -1481,19 +1411,21 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bflo ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm @@ -1509,21 +1441,23 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bflo ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store bfloat %in, ptr addrspace(1) %out ret void @@ -1541,19 +1475,21 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm @@ -1569,21 +1505,23 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <2 x bfloat> %in, ptr addrspace(1) %out ret void @@ -1602,25 +1540,25 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: @@ -1635,25 +1573,25 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x bfloat> %in, ptr addrspace(1) %out ret void @@ -1673,24 +1611,26 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; @@ -1707,24 +1647,26 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <6 x bfloat> %in, ptr addrspace(1) %out @@ -1751,36 +1693,39 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %ou ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s10, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s11 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s7 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s10, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 ; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[10:11] offset:12 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s7 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: @@ -1802,38 +1747,39 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %ou ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s10, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[10:11] offset:12 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[8:9] offset:12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s10, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[0:1] offset:12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[8:9] offset:12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store half %in, ptr addrspace(1) %out store <7 x bfloat> %in2, ptr addrspace(1) %out2 @@ -1853,21 +1799,23 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 1 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 1 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm @@ -1884,23 +1832,25 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i1 %in, ptr addrspace(1) %out ret void @@ -1919,26 +1869,24 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; @@ -1954,26 +1902,24 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s13 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s13 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store fp128 %in, ptr addrspace(1) %out @@ -1994,47 +1940,27 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s5 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-2-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s5 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-8-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: @@ -2050,47 +1976,27 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 -; GFX90a-PRELOAD-2-NEXT: global_store_short v2, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 -; GFX90a-PRELOAD-8-NEXT: global_store_short v2, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <7 x i8> %in, ptr addrspace(1) %out ret void @@ -2112,28 +2018,30 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s9 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; @@ -2152,28 +2060,30 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v1, s[6:7] offset:12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] offset:12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v1, s[6:7] offset:12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <7 x half> %in, ptr addrspace(1) %out @@ -2195,28 +2105,27 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0xc -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s7 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s7 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: @@ -2232,28 +2141,27 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store i32 %in2, ptr addrspace(1) %out2 @@ -2279,32 +2187,37 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: s_load_dword s7, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_load_dword s7, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: @@ -2325,34 +2238,37 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s8 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s3 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v4, s[6:7] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s8 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v4, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <3 x i32> %in2, ptr addrspace(1) %out2 @@ -2373,27 +2289,27 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: @@ -2409,27 +2325,27 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[2:3] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store i16 %in2, ptr addrspace(1) %out2 @@ -2450,31 +2366,27 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: global_store_short v1, v2, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: @@ -2490,31 +2402,27 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v2, s[6:7] -; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[10:11] +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[2:3] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <2 x i8> %in2, ptr addrspace(1) %out2 diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 2ce0b9eed02cb..011bb332ddd0a 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) { ; SDAG-LABEL: buffers_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a ; ; GISEL-LABEL: buffers_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -50,7 +50,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-NEXT: s_mov_b32 s3, 0 ; SDAG-NEXT: s_mov_b32 s2, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -69,7 +69,7 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-NEXT: s_mov_b32 s3, 0 ; GISEL-NEXT: s_mov_b32 s2, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac ; ; GISEL-LABEL: buffers_might_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -173,7 +173,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; SDAG-LABEL: independent_offsets: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; ; GISEL-LABEL: independent_offsets: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index fcccd2da07f76..f692584ef92be 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -21,7 +21,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s3, 32, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotl_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -47,7 +47,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s0, 32, s7 @@ -57,7 +57,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 @@ -95,8 +95,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -111,8 +111,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, 32, s6 ; GFX8-NEXT: s_sub_i32 s3, 32, s7 @@ -128,22 +128,22 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s0, 32, s7 -; GFX10-NEXT: s_sub_i32 s1, 32, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s0 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: s_sub_i32 s2, 32, s7 +; GFX10-NEXT: s_sub_i32 s3, 32, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s7 @@ -188,8 +188,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,8 +210,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotl_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s9 ; GFX8-NEXT: s_sub_i32 s9, 32, s11 @@ -233,26 +233,26 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotl_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s0, 32, s8 -; GFX10-NEXT: s_sub_i32 s1, 32, s9 +; GFX10-NEXT: s_sub_i32 s2, 32, s8 +; GFX10-NEXT: s_sub_i32 s3, 32, s9 ; GFX10-NEXT: s_sub_i32 s8, 32, s11 ; GFX10-NEXT: s_sub_i32 s9, 32, s10 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s8 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s9 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s2 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s8 diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 214894092a8b0..a368aa1055b28 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotr_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 @@ -43,7 +43,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s7 @@ -52,7 +52,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -84,8 +84,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -98,8 +98,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 @@ -113,20 +113,20 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s6 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7 @@ -161,8 +161,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -179,8 +179,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotr_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 @@ -198,22 +198,22 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s11 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s10 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s8 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index acdcb631dccbd..4952f80c0b411 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -59,7 +59,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -71,7 +71,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -90,7 +90,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -153,7 +153,9 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -178,20 +180,20 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -212,21 +214,21 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s0, s8, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -235,9 +237,10 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -247,9 +250,12 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -271,20 +277,20 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -305,21 +311,21 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s0, s8, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshl_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshl_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshl_b32_e32 v3, s1, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -328,9 +334,10 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -340,9 +347,12 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -364,7 +374,7 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -375,7 +385,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -395,7 +405,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -416,7 +426,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -427,7 +437,9 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -450,7 +462,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -461,7 +473,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -481,7 +493,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -498,7 +510,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -509,7 +521,9 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -532,7 +546,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -544,7 +558,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -566,7 +580,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -595,7 +609,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -607,7 +621,9 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -633,7 +649,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -645,7 +661,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -669,7 +685,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -692,7 +708,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -704,7 +720,9 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index 9a03d216c7a99..b54df3b4d0c6c 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,15 +39,13 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mul_i32 s4, s6, s7 -; SI-NEXT: s_add_i32 s4, s4, s8 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_add_i32 s4, s4, s6 ; SI-NEXT: s_ashr_i32 s5, s4, 31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -56,15 +54,13 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mul_i32 s4, s6, s7 -; VI-NEXT: s_add_i32 s4, s4, s8 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_add_i32 s4, s4, s6 ; VI-NEXT: s_ashr_i32 s5, s4, 31 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -81,7 +77,7 @@ entry: define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -96,7 +92,7 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -117,8 +113,8 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -130,8 +126,8 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun ; ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -148,7 +144,7 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -166,7 +162,7 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -190,8 +186,8 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind { ; SI-LABEL: s_sext_i16_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -203,8 +199,8 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun ; ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -221,7 +217,7 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,7 +231,7 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -259,8 +255,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; SI-LABEL: s_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -275,8 +271,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -299,15 +295,13 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_cmp_eq_u32 s7, s8 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 +; SI-NEXT: s_cmp_eq_u32 s5, s6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -316,15 +310,13 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_cmp_eq_u32 s7, s8 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 +; VI-NEXT: s_cmp_eq_u32 s5, s6 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -350,8 +342,8 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -375,8 +367,8 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n ; ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -415,7 +407,7 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -443,7 +435,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -487,7 +479,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: s_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -513,7 +505,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) ; ; VI-LABEL: s_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -552,7 +544,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -580,7 +572,7 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index f88aaf389ca9a..3644bef9c20a1 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,12 +50,12 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s4, 14 +; HAWAII-NEXT: s_or_b32 s0, s6, 14 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: v_mov_b32_e32 v1, s7 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v1, s2 @@ -70,12 +70,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s4, 14 +; FIJI-NEXT: s_or_b32 s0, s6, 14 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: v_mov_b32_e32 v1, s7 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s3, s1, 0xffff @@ -94,9 +94,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -114,9 +114,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s3, s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, s2 @@ -133,16 +133,16 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX11-LABEL: local_store_i55: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[0:1] offset:14 +; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[2:3] offset:14 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s1, s3, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s3 -; GFX11-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-NEXT: s_and_b32 s3, s1, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_mov_b32_e32 v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX11-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 ; GFX11-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s2 @@ -169,8 +169,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s2 @@ -182,8 +182,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -195,8 +195,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX10-LABEL: local_store_i48: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -208,10 +208,10 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX11-LABEL: local_store_i48: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:4 ; GFX11-NEXT: ds_store_b32 v0, v2 @@ -223,9 +223,9 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x4 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x4 +; HAWAII-NEXT: s_load_dword s3, s[6:7], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_and_b32 s2, s2, 1 @@ -239,9 +239,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x10 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[6:7], 0x10 +; FIJI-NEXT: s_load_dword s3, s[6:7], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s2, s2, 1 @@ -255,9 +255,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 @@ -271,9 +271,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX10-LABEL: local_store_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 @@ -287,13 +287,13 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX11-LABEL: local_store_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s2, s4, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: ds_store_b8 v2, v3 offset:8 ; GFX11-NEXT: ds_store_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 3ae982089228d..a66226a0ef108 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX8-LABEL: s_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s0, s6, s7 @@ -43,7 +43,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX12-LABEL: s_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -60,8 +60,8 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: s_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,10 +72,10 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX8-LABEL: s_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2 +; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -84,18 +84,18 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX9-LABEL: s_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s0, 0x4d2, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_sub_i32 s2, 0x4d2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -194,7 +194,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: test_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: test_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -219,7 +219,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: test_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -238,7 +238,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -257,7 +257,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -272,7 +272,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -284,7 +284,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -306,7 +306,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -328,7 +328,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -350,7 +350,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -365,7 +365,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 @@ -391,7 +391,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -412,7 +412,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -432,7 +432,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -446,9 +446,11 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -472,7 +474,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -497,7 +499,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -515,7 +517,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -527,9 +529,11 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -551,7 +555,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -583,7 +587,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -604,7 +608,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -617,9 +621,11 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -642,8 +648,8 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { ; GFX6-LABEL: s_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -656,8 +662,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX8-LABEL: s_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_u32 s2, s4, s6 ; GFX8-NEXT: s_subb_u32 s3, s5, s7 @@ -670,22 +676,22 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX9-LABEL: s_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s4, s6 -; GFX9-NEXT: s_subb_u32 s1, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_sub_u32 s2, s4, s6 +; GFX9-NEXT: s_subb_u32 s3, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -702,8 +708,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind { ; GFX6-LABEL: v_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -725,8 +731,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: v_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -747,12 +753,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: v_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 @@ -763,10 +769,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX12-LABEL: v_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7] @@ -791,8 +799,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -816,8 +824,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -840,12 +848,12 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -858,10 +866,12 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7] @@ -888,8 +898,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -921,8 +931,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -961,14 +971,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v4i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:16 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -987,10 +997,12 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index bfeab97d81dbe..02cc7d1185cf3 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -8,13 +8,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -24,8 +24,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -67,8 +67,10 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -96,25 +98,25 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_pk_sub_i16 v0, s9, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -137,23 +139,23 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: s_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 +; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -175,7 +177,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; GCN-LABEL: s_test_sub_self_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -185,7 +187,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_sub_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 @@ -195,7 +197,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_sub_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -214,7 +216,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -227,7 +229,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -246,7 +248,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -258,7 +260,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -277,7 +279,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -291,7 +293,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -310,7 +312,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_sub_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -324,7 +326,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_sub_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -349,7 +353,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -363,7 +367,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -382,7 +386,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_sub_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -396,7 +400,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_sub_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -420,7 +426,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -433,7 +439,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -452,7 +458,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -466,7 +472,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -490,7 +498,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -503,7 +511,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -521,7 +529,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -535,7 +543,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -560,7 +570,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -573,7 +583,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -591,7 +601,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -605,7 +615,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -630,13 +642,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -648,8 +660,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -672,13 +684,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -692,8 +704,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -726,14 +740,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -746,8 +760,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -772,13 +786,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -794,8 +808,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -829,13 +845,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -847,8 +863,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -873,13 +889,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -893,8 +909,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -927,12 +945,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -947,8 +965,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -974,13 +992,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -997,8 +1015,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index f686aad0cefc2..dfd9a650ff0e9 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -44,7 +44,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -80,7 +80,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -112,7 +112,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX1030-LABEL: udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -185,7 +185,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -251,7 +251,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 @@ -346,7 +346,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -456,7 +456,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -507,7 +507,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v2i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -619,7 +619,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s6, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s6, s10 @@ -809,7 +809,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -904,7 +904,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v4i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_pow2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: udiv_i32_div_pow2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1148,7 +1148,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GFX1030-LABEL: udiv_i32_div_pow2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_even: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: udiv_i32_div_k_even: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GFX1030-LABEL: udiv_i32_div_k_even: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1277,7 +1277,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_odd: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: udiv_i32_div_k_odd: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GFX1030-LABEL: udiv_i32_div_k_odd: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_udiv_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1429,7 +1429,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1452,7 +1452,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX1030-LABEL: v_udiv_i8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1569,7 +1569,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i16: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1651,7 +1651,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1688,7 +1688,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1725,7 +1725,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1770,7 +1770,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i23: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1885,7 +1885,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1922,7 +1922,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1967,7 +1967,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i24: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; SI-LABEL: scalarize_mulhu_4xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2076,7 +2076,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; VI-LABEL: scalarize_mulhu_4xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2130,7 +2130,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GFX1030-LABEL: scalarize_mulhu_4xi32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -2193,7 +2193,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read define amdgpu_kernel void @test_udiv2(i32 %p) { ; SI-LABEL: test_udiv2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; VI-LABEL: test_udiv2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2217,7 +2217,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GCN-LABEL: test_udiv2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2227,7 +2227,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GFX1030-LABEL: test_udiv2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; SI-LABEL: test_udiv_3_mulhu: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; VI-LABEL: test_udiv_3_mulhu: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -2279,7 +2279,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GCN-LABEL: test_udiv_3_mulhu: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2290,7 +2290,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GFX1030-LABEL: test_udiv_3_mulhu: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index 65eb1cee42350..d1bf5ecb56984 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -13,37 +13,37 @@ declare double @llvm.fabs.f64(double) define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cnd_nan_nosgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_cmp_eq_u32 s8, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cnd_nan_nosgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: s_cmp_eq_u32 s4, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc @@ -54,35 +54,37 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; ; GFX10-LABEL: v_cnd_nan_nosgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan_nosgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -107,7 +109,7 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { ; SI-LABEL: v_cnd_nan: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -122,7 +124,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; VI-LABEL: v_cnd_nan: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -135,7 +137,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 @@ -146,7 +148,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX11-LABEL: v_cnd_nan: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 @@ -169,30 +171,30 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -200,26 +202,27 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[2:3] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -235,30 +238,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -266,24 +269,25 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1] -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -301,30 +305,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -332,26 +336,27 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[2:3] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -367,30 +372,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -398,24 +403,25 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1] -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -433,16 +439,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -451,20 +457,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -472,32 +478,34 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -518,16 +526,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -536,20 +544,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -557,32 +565,34 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -603,8 +613,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -622,8 +632,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -642,9 +652,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -655,9 +666,12 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -681,8 +695,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -702,8 +716,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -727,13 +741,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc @@ -743,8 +757,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -773,8 +789,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -794,8 +810,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -819,13 +835,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc @@ -835,8 +851,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -865,8 +883,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -887,8 +905,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -913,13 +931,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc @@ -930,8 +948,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc @@ -961,8 +981,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -987,8 +1007,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1016,14 +1036,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1036,8 +1056,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1070,8 +1092,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1096,8 +1118,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1125,14 +1147,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1145,8 +1167,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1181,8 +1205,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1207,8 +1231,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1236,14 +1260,14 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1256,8 +1280,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1290,8 +1316,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1315,8 +1341,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1343,13 +1369,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 @@ -1362,8 +1388,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc @@ -1398,8 +1426,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1423,8 +1451,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1451,14 +1479,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc @@ -1469,8 +1497,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1502,8 +1532,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1526,8 +1556,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1553,14 +1583,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1571,8 +1601,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1604,8 +1636,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1625,8 +1657,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1650,13 +1682,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc @@ -1666,8 +1698,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1697,8 +1731,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1722,8 +1756,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1751,13 +1785,13 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc @@ -1771,8 +1805,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1808,18 +1844,18 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 @@ -1827,22 +1863,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1855,15 +1891,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1871,21 +1907,23 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX10-NEXT: global_store_short v2, v0, s[2:3] +; GFX10-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1910,37 +1948,37 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] @@ -1951,35 +1989,37 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] @@ -2001,18 +2041,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 @@ -2020,22 +2060,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 @@ -2049,15 +2089,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2066,21 +2106,23 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index e3185e189157b..ee99fcc586334 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -89,7 +89,7 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) { ; SDAG-VI-LABEL: basic_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 @@ -104,7 +104,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -117,7 +117,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff @@ -132,7 +132,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-VI-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -413,13 +413,13 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) { ; SDAG-VI-LABEL: vec_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 -; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 +; SDAG-VI-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s2, 0 ; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 ; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 @@ -430,24 +430,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; SDAG-GFX9-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: s_movk_i32 s0, 0xff +; SDAG-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX9-NEXT: s_movk_i32 s2, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX9-NEXT: v_pk_max_i16 v1, s4, 0 -; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s0 op_sel_hi:[1,0] -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s2 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0 +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -457,24 +457,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_sext_i32_i16 s2, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16 -; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 +; GISEL-VI-NEXT: s_lshr_b32 s3, s4, 16 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4 -; GISEL-VI-NEXT: s_max_i32 s2, s2, s3 -; GISEL-VI-NEXT: s_max_i32 s3, s4, s3 -; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff ; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-VI-NEXT: s_max_i32 s4, s4, s2 +; GISEL-VI-NEXT: s_max_i32 s2, s3, s2 +; GISEL-VI-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_min_i32 s3, s3, s4 ; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 -; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 +; GISEL-VI-NEXT: s_min_i32 s3, s3, s4 ; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 -; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 -; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 +; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 +; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16 +; GISEL-VI-NEXT: s_or_b32 s2, s3, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -483,40 +483,40 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-GFX9-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 +; GISEL-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, 0 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s4 ; GISEL-GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GISEL-GFX9-NEXT: s_max_i32 s0, s1, s0 -; GISEL-GFX9-NEXT: s_max_i32 s1, s4, 0 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s0 -; GISEL-GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GISEL-GFX9-NEXT: s_max_i32 s2, s3, s2 +; GISEL-GFX9-NEXT: s_max_i32 s3, s4, 0 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s2 +; GISEL-GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0xff00ff -; GISEL-GFX9-NEXT: s_min_i32 s1, s1, s4 -; GISEL-GFX9-NEXT: s_min_i32 s0, s0, 0xff -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s4 +; GISEL-GFX9-NEXT: s_min_i32 s2, s2, 0xff +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, 0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 -; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 -; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3 -; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-GFX11-NEXT: s_ashr_i32 s4, s4, 16 +; GISEL-GFX11-NEXT: s_max_i32 s2, s3, s2 +; GISEL-GFX11-NEXT: s_max_i32 s3, s4, 0 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 ; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index e12a4beb5dbe5..e3cfb5ecaf18e 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -20,7 +20,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -101,7 +101,7 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) { define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_2xf16: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_2xf16: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -138,25 +138,25 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 { ; GFX1032-LABEL: test_vopc_class: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vopc_class: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 0x204 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 0x204 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 @@ -169,27 +169,27 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 ; GFX1032-LABEL: test_vcmp_vcnd_f16: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s4 ; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo -; GFX1032-NEXT: global_store_short v1, v0, s[2:3] +; GFX1032-NEXT: global_store_short v1, v0, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vcmp_vcnd_f16: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_cmp_neq_f16_e64 vcc, 0x7c00, s4 ; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc -; GFX1064-NEXT: global_store_short v1, v0, s[2:3] +; GFX1064-NEXT: global_store_short v1, v0, s[0:1] ; GFX1064-NEXT: s_endpgm %cmp = fcmp oeq half %x, 0x7FF0000000000000 %sel = select i1 %cmp, half 1.0, half %x @@ -200,7 +200,7 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -239,7 +239,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -253,7 +253,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -292,7 +292,7 @@ define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -318,10 +318,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_mask_if: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 10, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; %if -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dword v0, v0, s[0:1] @@ -331,10 +331,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1064-LABEL: test_mask_if: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 10, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; %if -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dword v0, v0, s[0:1] @@ -355,7 +355,7 @@ endif: define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_loop_with_if: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -516,42 +516,42 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-LABEL: test_loop_with_if_else_break: ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s3, 0 -; GFX1032-NEXT: ; implicit-def: $sgpr4 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $sgpr3 ; GFX1032-NEXT: s_branch .LBB11_4 ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_add_i32 s3, s3, 1 +; GFX1032-NEXT: s_add_i32 s2, s2, 1 ; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] -; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1 +; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s2, v1 ; GFX1032-NEXT: s_add_u32 s0, s0, 4 ; GFX1032-NEXT: s_addc_u32 s1, s1, 0 -; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo ; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s4, s4, s5 +; GFX1032-NEXT: s_or_b32 s3, s3, s5 ; GFX1032-NEXT: .LBB11_3: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 -; GFX1032-NEXT: s_or_b32 s2, s5, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 s4, s5, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: .LBB11_4: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v3, v2, s[0:1] -; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_or_b32 s3, s3, exec_lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3 ; GFX1032-NEXT: s_cbranch_vccz .LBB11_2 ; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: ; implicit-def: $sgpr3 +; GFX1032-NEXT: ; implicit-def: $sgpr2 ; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1032-NEXT: s_branch .LBB11_3 ; GFX1032-NEXT: .LBB11_6: ; %.loopexit @@ -561,10 +561,10 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064: ; %bb.0: ; %bb ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_6 ; GFX1064-NEXT: ; %bb.1: ; %.preheader -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 @@ -631,7 +631,7 @@ bb8: define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_addc_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -643,7 +643,7 @@ define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_addc_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -664,7 +664,7 @@ bb: define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subbrev_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -676,7 +676,7 @@ define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) ; ; GFX1064-LABEL: test_subbrev_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -697,7 +697,7 @@ bb: define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subb_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -709,7 +709,7 @@ define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_subb_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -730,7 +730,7 @@ bb: define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -892,7 +892,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_udiv64: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1063,7 +1063,7 @@ bb: define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1077,7 +1077,7 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX1064-LABEL: test_div_scale_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1104,31 +1104,33 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] +; GFX1032-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_scale_f64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] +; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -1186,8 +1188,8 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-LABEL: test_div_fmas_f32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s5 @@ -1195,14 +1197,14 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1032-NEXT: global_store_dword v2, v0, s[2:3] +; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s5 @@ -1210,7 +1212,7 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1064-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1064-NEXT: global_store_dword v2, v0, s[2:3] +; GFX1064-NEXT: global_store_dword v2, v0, s[0:1] ; GFX1064-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 @@ -1221,14 +1223,14 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1032-LABEL: test_div_fmas_f64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: v_mov_b32_e32 v2, s10 ; GFX1032-NEXT: v_mov_b32_e32 v3, s11 -; GFX1032-NEXT: s_bitcmp1_b32 s2, 0 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -1238,14 +1240,14 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1064-LABEL: test_div_fmas_f64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s9 ; GFX1064-NEXT: v_mov_b32_e32 v2, s10 ; GFX1064-NEXT: v_mov_b32_e32 v3, s11 -; GFX1064-NEXT: s_bitcmp1_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -1261,11 +1263,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1290,10 +1290,9 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1344,7 +1343,7 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 @@ -1363,7 +1362,7 @@ define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) # ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v0, s[0:1], s7, s7, s6 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 @@ -1389,13 +1388,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1032-LABEL: test_br_cc_f16: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1032-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX1032-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; GFX1032-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1409,13 +1408,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1064-LABEL: test_br_cc_f16: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1064-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX1064-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v2 ; GFX1064-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1446,12 +1445,12 @@ two: define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 { ; GCN-LABEL: test_brcc_i1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp0_b32 s2, 0 +; GCN-NEXT: s_bitcmp0_b32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB25_2 ; GCN-NEXT: ; %bb.1: ; %store -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0xde ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,14 +1472,14 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1032-LABEL: test_preserve_condition_undef_flag: ; GFX1032: ; %bb.0: ; %bb0 ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dword s1, s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s0, s2, 1.0 -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s3, 1.0 -; GFX1032-NEXT: v_cmp_ngt_f32_e64 s2, s2, 0 -; GFX1032-NEXT: s_or_b32 s0, s0, s1 -; GFX1032-NEXT: s_or_b32 s0, s0, s2 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s2, s0, 1.0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s1, 1.0 +; GFX1032-NEXT: v_cmp_ngt_f32_e64 s0, s0, 0 +; GFX1032-NEXT: s_or_b32 s1, s2, s1 +; GFX1032-NEXT: s_or_b32 s0, s1, s0 ; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX1032-NEXT: ; %bb.1: ; %bb1 @@ -1493,11 +1492,11 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1064-LABEL: test_preserve_condition_undef_flag: ; GFX1064: ; %bb.0: ; %bb0 ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dword s5, s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, 1.0 -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0 +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s5, 1.0 ; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s4, 0 ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] @@ -1531,7 +1530,7 @@ bb2: define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX1032-NEXT: ; implicit-def: $sgpr1 ; GFX1032-NEXT: ; implicit-def: $sgpr2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1569,7 +1568,7 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GFX1064-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX1064-NEXT: ; implicit-def: $sgpr4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1634,7 +1633,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -1649,7 +1648,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1672,29 +1671,29 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-LABEL: test_set_inactive: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v0, 42 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v0, 42 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1064-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) store i32 %tmp, ptr addrspace(1) %out @@ -1704,7 +1703,7 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s2 @@ -1718,7 +1717,7 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s2 @@ -2138,7 +2137,7 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| @@ -2148,7 +2147,7 @@ define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| @@ -2166,26 +2165,26 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s1 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) store i64 %result, ptr addrspace(1) %out @@ -2195,7 +2194,7 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| @@ -2205,7 +2204,7 @@ define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| @@ -2222,25 +2221,25 @@ define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -2354,7 +2353,7 @@ define amdgpu_ps float @test_ps_live() #0 { define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2374,7 +2373,7 @@ define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, pt ; ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2471,7 +2470,7 @@ main_body: define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2505,7 +2504,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2566,7 +2565,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-LABEL: fcmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2598,7 +2597,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; ; GFX1064-LABEL: fcmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 @@ -2658,7 +2657,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2692,7 +2691,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2752,7 +2751,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-LABEL: fcmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2784,7 +2783,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; ; GFX1064-LABEL: fcmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index e15fd7f29671a..9fac17f33d0d3 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -5,31 +5,31 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -54,33 +54,33 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v3, v7, v3 ; SI-NEXT: v_xor_b32_e32 v2, v6, v2 ; SI-NEXT: v_xor_b32_e32 v1, v5, v1 ; SI-NEXT: v_xor_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -107,8 +107,8 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 @@ -133,8 +133,8 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; ; VI-LABEL: xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -165,32 +165,32 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: v_xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -216,30 +216,30 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -263,7 +263,7 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: scalar_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -276,7 +276,7 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: scalar_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -292,8 +292,8 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: scalar_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -304,10 +304,10 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: scalar_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_not_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -321,7 +321,7 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -360,31 +360,31 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -409,8 +409,8 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -424,8 +424,8 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: scalar_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] @@ -442,7 +442,7 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: scalar_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -456,7 +456,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: scalar_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -473,7 +473,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -492,7 +492,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -514,7 +514,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { ; SI-LABEL: xor_cf: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -545,7 +545,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; ; VI-LABEL: xor_cf: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -591,8 +591,8 @@ endif: define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -605,15 +605,15 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: scalar_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s3, s3, 0xf237b -; VI-NEXT: s_xor_b32 s2, s2, 0x3039 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_xor_b32 s1, s1, 0xf237b +; VI-NEXT: s_xor_b32 s0, s0, 0x3039 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 @@ -624,30 +624,30 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_literal_multi_use_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 ; SI-NEXT: s_movk_i32 s8, 0x3039 ; SI-NEXT: s_mov_b32 s9, 0xf237b -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: s_add_u32 s0, s2, 0x3039 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: s_addc_u32 s1, s3, 0xf237b +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_add_u32 s0, s6, 0x3039 +; SI-NEXT: s_addc_u32 s1, s7, 0xf237b ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_literal_multi_use_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x3039 ; VI-NEXT: s_mov_b32 s3, 0xf237b ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -675,8 +675,8 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -688,14 +688,14 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: scalar_xor_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, 63 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_xor_b32 s0, s0, 63 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 63 @@ -706,8 +706,8 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_neg_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -719,14 +719,14 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; ; VI-LABEL: scalar_xor_neg_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %or = xor i64 %a, -8 @@ -737,7 +737,7 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_i64_neg_inline_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -756,7 +756,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; ; VI-LABEL: vector_xor_i64_neg_inline_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -777,7 +777,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -796,7 +796,7 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: vector_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 From 276fb59d72fd556d74eeec255fa14a53ab26ecb4 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Tue, 23 Jul 2024 03:42:17 +0530 Subject: [PATCH 8/8] fix lit tests after rebase. --- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 294 - .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 10152 ++++++++++++++-- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 4 +- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 2142 ++-- 4 files changed, 10170 insertions(+), 2422 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 988b23d0fe1cc..95d8ca391b843 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -416,298 +416,4 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ret <2 x i16> %ret } -define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 1023 - %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret float %result -} - -define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -256 - %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret float %result -} - -define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 1023 - %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret void -} - -define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -256 - %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 - %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret <2 x half> %result -} - -define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 - %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret <2 x half> %result -} - -define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 - %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret void -} - -define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 - %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret void -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 - %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret <2 x i16> %result -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 - %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret <2 x i16> %result -} - -define void @flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 - %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret void -} - -define void @flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 - %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret void -} - attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 430fc46592553..aad74410d1453 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -11,8 +11,8 @@ declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() -define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss: +define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss: +; GFX11-LABEL: v_permlane16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 @@ -38,7 +38,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss: +; GFX12-LABEL: v_permlane16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 @@ -51,13 +51,252 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vii: +define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vss_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vss_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c @@ -69,7 +308,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vii: +; GFX11-LABEL: v_permlane16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c @@ -83,7 +322,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vii: +; GFX12-LABEL: v_permlane16_b32_vii_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -94,587 +333,680 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -; FIXME-GFX10PLUS: It is allowed to have both immediates as literals -define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vll: +define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlane16_b32_vii_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vll: +; GFX11-LABEL: v_permlane16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vll: +; GFX12-LABEL: v_permlane16_b32_vii_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vvv: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvv: +; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvv: +; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvv: +; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvv: +; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvs: +define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvs: +; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvs: +; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvs: +; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvs: +; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvs: +; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-LABEL: v_permlane16_b32_vsv: +; FIXME-GFX10PLUS: It is allowed to have both immediates as literals +define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vll_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vsv: +; GFX11-LABEL: v_permlane16_b32_vll_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vll_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vsv: +; GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vsv: +; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vsv: +; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_fi: +define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float %src0) { +; GFX10-LABEL: v_permlane16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_fi: +; GFX11-LABEL: v_permlane16_b32_vll_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_fi: +; GFX12-LABEL: v_permlane16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) + store float %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_bc: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_bc: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_bc: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_fi_bc: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_fi_bc: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_fi_bc: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store i32 %v, ptr addrspace(1) %out +; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss: +define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vvv_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vss: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vii: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vii: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vii: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) +; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -; FIXME-GFX10PLUS: It is allowed to have both immediates as literals -define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vll: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vll: +; GFX11-LABEL: v_permlane16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vll: +; GFX12-LABEL: v_permlane16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vvv: +define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlane16_b32_vvv_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c @@ -685,11 +1017,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv: +; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c @@ -703,13 +1035,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv: +; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c @@ -722,13 +1054,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv: +; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 @@ -740,13 +1072,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv: +; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 @@ -757,42 +1089,117 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vvv_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vvv_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs: +define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 ; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs: +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs: +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -800,13 +1207,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs: +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 @@ -814,13 +1221,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs: +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -828,13 +1235,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs: +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 @@ -842,867 +1249,8760 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-LABEL: v_permlanex16_b32_vsv: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_vvs_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vvs_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv: +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv: +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_fi: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vss_fi: +; GFX11-LABEL: v_permlane16_b32_vvs_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vss_fi: +; GFX12-LABEL: v_permlane16_b32_vvs_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store i32 %v, ptr addrspace(1) %out + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_bc: +define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX10-LABEL: v_permlane16_b32_vsv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vss_bc: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vss_bc: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc: +define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { +; GFX10-LABEL: v_permlane16_b32_vsv_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store i32 %v, ptr addrspace(1) %out +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store float %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_tid_tid: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, double %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_tid_tid: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_tid_tid: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_undef_tid: +define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_undef_tid: +; GFX11-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_undef_tid: +; GFX12-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid: +define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid: +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid: +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid: +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid: +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid: +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_fi: +define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_i_tid_fi: +; GFX11-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_i_tid_fi: +; GFX12-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_bc: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_i_tid_bc: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_i_tid_bc: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc: +define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc: +; GFX11-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc: +; GFX12-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_tid_tid: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_tid_tid: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_tid_tid: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_undef_tid: +define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_undef_tid: +; GFX11-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_undef_tid: +; GFX12-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store float %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid: +define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid: +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid: +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid: +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid: +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid: +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_fi: +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_i_tid_fi: +; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_i_tid_fi: +; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_bc: +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_i_tid_bc: +; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_i_tid_bc: +; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc: +define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc: +; GFX11-LABEL: v_permlanex16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc: +; GFX12-LABEL: v_permlanex16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } + +define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vii_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vii_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vii_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlanex16_b32_vii_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vii_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vii_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +; FIXME-GFX10PLUS: It is allowed to have both immediates as literals +define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vll_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vll_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vll_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlanex16_b32_vll_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vll_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vll_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vvv_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlanex16_b32_vvv_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vvv_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vvv_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vvv_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vvv_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX10-LABEL: v_permlanex16_b32_vsv_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { +; GFX10-LABEL: v_permlanex16_b32_vsv_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, double %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_fi_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_fi_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_tid_tid_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_tid_tid_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_tid_tid_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_tid_tid_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_tid_tid_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_tid_tid_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane16.f64(double %tidx_f64, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_undef_tid_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_undef_tid_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_undef_tid_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_undef_tid_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_undef_tid_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_undef_tid_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane16.f64(double 1234.5, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_tid_tid_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_tid_tid_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_tid_tid_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_tid_tid_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlanex16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlanex16.f64(double %tidx_f64, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_undef_tid_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_undef_tid_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_undef_tid_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_undef_tid_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlanex16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlanex16.f64(double 1234.5, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_half: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_half: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_half: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_half: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_half: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_half: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_bfloat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_bfloat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_bfloat: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlane16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store bfloat %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_bfloat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_bfloat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_bfloat: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlanex16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store bfloat %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x half> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x half> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v2f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v2f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v2f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v2f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x float> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v2f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v2f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v2f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v2f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x float> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v7i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v10 +; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v7i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v9 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v7i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v7i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v7i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <7 x i32> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v7i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v10 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v7i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v9 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v7i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v7i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v7i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlanex16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <7 x i32> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v8i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v8i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v8i16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v8i16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <8 x i16> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v8i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v8i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v8i16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v8i16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlanex16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <8 x i16> %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index ed1283af5c476..1de9206801e2a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -446,7 +446,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc %val = call <2 x float> @llvm.amdgcn.image.sample.2d.v2f32.f32(i32 9, float %a, float %b, <8 x i32> %desc, <4 x i32> , i1 false, i32 0, i32 0) %val0 = extractelement <2 x float> %val, i32 0 %valadd = fadd float %load, %val0 - call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 1, float %valadd, float undef, float undef, float undef, i1 immarg true, i1 immarg true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %valadd, float undef, float undef, float undef, i1 true, i1 true) ret void } @@ -504,7 +504,7 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 %load0 = load float, ptr addrspace(5) %alloca %load1 = load float, ptr addrspace(5) %alloca2 %valadd = fadd float %load0, %load1 - call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 1, float %valadd, float undef, float undef, float undef, i1 immarg true, i1 immarg true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %valadd, float undef, float undef, float undef, i1 true, i1 true) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index 21fd193e8db54..a547c258e3921 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -1,14 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s -define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -19,7 +23,18 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: ptr1_i8: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i8: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -30,7 +45,18 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: ptr1_i8: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i8: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -41,7 +67,7 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -52,7 +78,18 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: ptr1_i8: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i8: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -63,7 +100,18 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: ptr1_i8: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i8: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -78,8 +126,8 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ret void } -define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -90,7 +138,18 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -101,7 +160,18 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -112,7 +182,7 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -123,7 +193,18 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -134,7 +215,18 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -149,8 +241,8 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -161,7 +253,18 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -172,7 +275,18 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -183,7 +297,7 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -194,7 +308,18 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -205,7 +330,18 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -220,8 +356,8 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ret void } -define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -231,7 +367,17 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -241,7 +387,17 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -251,7 +407,7 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -261,7 +417,17 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -271,7 +437,17 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -285,8 +461,8 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 } -define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: +define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0 @@ -298,7 +474,19 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-PRELOAD-1-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s5, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x0 @@ -310,7 +498,19 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-PRELOAD-4-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s5, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX940-PRELOAD-8-NEXT: s_load_dword s5, s[0:1], 0x0 @@ -322,7 +522,7 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -334,7 +534,19 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-PRELOAD-1-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s3, s2 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -346,7 +558,19 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-PRELOAD-4-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s3, s2 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -362,8 +586,8 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ret void } -define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -376,7 +600,20 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-1-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s1, s0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -389,7 +626,20 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -402,7 +652,7 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -415,7 +665,20 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s2, s3 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -428,7 +691,20 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s2, s3 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -447,8 +723,8 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: +define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -458,7 +734,17 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -468,7 +754,17 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -478,7 +774,7 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -488,7 +784,17 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -498,7 +804,17 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -512,8 +828,8 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < } -define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { -; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg: +define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { +; GFX940-NO-PRELOAD-LABEL: byref_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -527,7 +843,21 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: byref_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: byref_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -541,7 +871,21 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: byref_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: byref_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: byref_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -555,7 +899,7 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 @@ -569,7 +913,21 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: byref_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: byref_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 @@ -583,7 +941,21 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: byref_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: byref_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 @@ -603,8 +975,8 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a } -define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: +define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i32_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 @@ -623,7 +995,26 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: v8i32_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_nop 1 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v8i32_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 @@ -642,7 +1033,26 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: v8i32_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_nop 1 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v8i32_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 @@ -661,7 +1071,7 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i32_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -680,7 +1090,26 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: v8i32_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v8i32_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -699,7 +1128,26 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: v8i32_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v8i32_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -721,8 +1169,8 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ret void } -define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: +define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -733,7 +1181,18 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -744,7 +1203,18 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -755,7 +1225,7 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -766,7 +1236,18 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -777,7 +1258,18 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -791,8 +1283,8 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ret void } -define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: +define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -804,7 +1296,19 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -816,7 +1320,19 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -828,7 +1344,7 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -840,7 +1356,19 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -852,7 +1380,19 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -867,8 +1407,8 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ret void } -define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: +define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -880,7 +1420,19 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -892,7 +1444,19 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -904,7 +1468,7 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -916,7 +1480,19 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -928,7 +1504,19 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -943,8 +1531,8 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ret void } -define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: +define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -955,7 +1543,18 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -966,7 +1565,18 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -977,7 +1587,7 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -988,7 +1598,18 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -999,7 +1620,18 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -1013,8 +1645,8 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ret void } -define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: +define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5f64_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -1036,7 +1668,29 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: v5f64_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_nop 1 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v5f64_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -1058,7 +1712,29 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: v5f64_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_nop 1 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v5f64_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -1080,7 +1756,7 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v5f64_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -1102,7 +1778,29 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: v5f64_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v5f64_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -1124,7 +1822,29 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: v5f64_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v5f64_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -1149,8 +1869,8 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ret void } -define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: +define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1159,7 +1879,16 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: +; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 @@ -1168,7 +1897,16 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: +; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 @@ -1177,7 +1915,7 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1186,7 +1924,16 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: +; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 @@ -1195,7 +1942,16 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: +; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 @@ -1218,6 +1974,16 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; +; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 @@ -1228,6 +1994,16 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; +; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 @@ -1248,6 +2024,16 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; +; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1258,6 +2044,16 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; +; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1282,6 +2078,16 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; +; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg: ; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 @@ -1292,6 +2098,16 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; +; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg: ; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 @@ -1312,6 +2128,16 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; +; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg: ; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1322,6 +2148,16 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; +; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg: ; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1335,1098 +2171,4 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ret void } -define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) #0 { -; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store half %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) #0 { -; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store bfloat %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <2 x bfloat> %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <3 x bfloat> %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <6 x bfloat> %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s10, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 -; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s7 -; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s10, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s7 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s10, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s7 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s10, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s3 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s10, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[8:9] offset:12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s10, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s3 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[8:9] offset:12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store half %in, ptr addrspace(1) %out - store <7 x bfloat> %in2, ptr addrspace(1) %out2 - ret void -} - -define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) #0 { -; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 1 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 1 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i1 %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) #0 { -; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store fp128 %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-2-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-8-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <7 x i8> %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v1, s[6:7] offset:12 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v1, s[6:7] offset:12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v1, s[6:7] offset:12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <7 x half> %in, ptr addrspace(1) %out - ret void -} - -; Test when previous argument was not dword aligned. -define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s7 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s7 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s7 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[6:7] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i16 %in, ptr addrspace(1) %out - store i32 %in2, ptr addrspace(1) %out2 - ret void -} - -define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: s_load_dword s7, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s7 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_load_dword s7, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: s_load_dword s7, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s3 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v4, s[6:7] -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v4, s[6:7] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v4, s[6:7] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i16 %in, ptr addrspace(1) %out - store <3 x i32> %in2, ptr addrspace(1) %out2 - ret void -} - -define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[2:3] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[2:3] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i16 %in, ptr addrspace(1) %out - store i16 %in2, ptr addrspace(1) %out2 - ret void -} - -define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s6 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[2:3] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s6 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[2:3] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i16 %in, ptr addrspace(1) %out - store <2 x i8> %in2, ptr addrspace(1) %out2 - ret void -} - -attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }